Tracing owner callstack in `contention_begin()` and `contention_end()`,
and storing in `owner_lock_stat` bpf map.
Signed-off-by: Chun-Tse Shao <ctshao@google.com>
---
.../perf/util/bpf_skel/lock_contention.bpf.c | 152 +++++++++++++++++-
1 file changed, 151 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 05da19fdab23..3f47fbfa237c 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -7,6 +7,7 @@
#include <asm-generic/errno-base.h>
#include "lock_data.h"
+#include <time.h>
/* for collect_lock_syms(). 4096 was rejected by the verifier */
#define MAX_CPUS 1024
@@ -178,6 +179,9 @@ int data_fail;
int task_map_full;
int data_map_full;
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+
static inline __u64 get_current_cgroup_id(void)
{
struct task_struct *task;
@@ -407,6 +411,60 @@ int contention_begin(u64 *ctx)
pelem->flags = (__u32)ctx[1];
if (needs_callstack) {
+ u32 i = 0;
+ int owner_pid;
+ unsigned long *entries;
+ struct task_struct *task;
+ cotd *data;
+
+ if (!lock_owner)
+ goto contention_begin_skip_owner_callstack;
+
+ task = get_lock_owner(pelem->lock, pelem->flags);
+ if (!task)
+ goto contention_begin_skip_owner_callstack;
+
+ owner_pid = BPF_CORE_READ(task, pid);
+
+ entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
+ if (!entries)
+ goto contention_begin_skip_owner_callstack;
+ for (i = 0; i < max_stack; i++)
+ entries[i] = 0x0;
+
+ task = bpf_task_from_pid(owner_pid);
+ if (task) {
+ bpf_get_task_stack(task, entries,
+ max_stack * sizeof(unsigned long),
+ 0);
+ bpf_task_release(task);
+ }
+
+ data = bpf_map_lookup_elem(&contention_owner_tracing,
+ &(pelem->lock));
+
+ // Contention just happens, or corner case `lock` is owned by
+ // process not `owner_pid`.
+ if (!data || data->pid != owner_pid) {
+ cotd first = {
+ .pid = owner_pid,
+ .timestamp = pelem->timestamp,
+ .count = 1,
+ };
+ bpf_map_update_elem(&contention_owner_tracing,
+ &(pelem->lock), &first, BPF_ANY);
+ bpf_map_update_elem(&contention_owner_stacks,
+ &(pelem->lock), entries, BPF_ANY);
+ }
+ // Contention is going on and new waiter joins.
+ else {
+ __sync_fetch_and_add(&data->count, 1);
+ // TODO: Since for owner the callstack would change at
+ // different time, We should check and report if the
+ // callstack is different with the recorded one in
+ // `contention_owner_stacks`.
+ }
+contention_begin_skip_owner_callstack:
pelem->stack_id = bpf_get_stackid(ctx, &stacks,
BPF_F_FAST_STACK_CMP | stack_skip);
if (pelem->stack_id < 0)
@@ -443,6 +501,7 @@ int contention_end(u64 *ctx)
struct tstamp_data *pelem;
struct contention_key key = {};
struct contention_data *data;
+ __u64 timestamp;
__u64 duration;
bool need_delete = false;
@@ -469,12 +528,103 @@ int contention_end(u64 *ctx)
return 0;
need_delete = true;
}
- duration = bpf_ktime_get_ns() - pelem->timestamp;
+ timestamp = bpf_ktime_get_ns();
+ duration = timestamp - pelem->timestamp;
if ((__s64)duration < 0) {
__sync_fetch_and_add(&time_fail, 1);
goto out;
}
+ if (needs_callstack && lock_owner) {
+ u64 owner_contention_time;
+ unsigned long *owner_stack;
+ struct contention_data *cdata;
+ cotd *otdata;
+
+ otdata = bpf_map_lookup_elem(&contention_owner_tracing,
+ &(pelem->lock));
+ owner_stack = bpf_map_lookup_elem(&contention_owner_stacks,
+ &(pelem->lock));
+ if (!otdata || !owner_stack)
+ goto contention_end_skip_owner_callstack;
+
+ owner_contention_time = timestamp - otdata->timestamp;
+
+ // Update `owner_lock_stat` if `owner_stack` is
+ // available.
+ if (owner_stack[0] != 0x0) {
+ cdata = bpf_map_lookup_elem(&owner_lock_stat,
+ owner_stack);
+ if (!cdata) {
+ struct contention_data first = {
+ .total_time = owner_contention_time,
+ .max_time = owner_contention_time,
+ .min_time = owner_contention_time,
+ .count = 1,
+ .flags = pelem->flags,
+ };
+ bpf_map_update_elem(&owner_lock_stat,
+ owner_stack, &first,
+ BPF_ANY);
+ } else {
+ __sync_fetch_and_add(&cdata->total_time,
+ owner_contention_time);
+ __sync_fetch_and_add(&cdata->count, 1);
+
+ /* FIXME: need atomic operations */
+ if (cdata->max_time < owner_contention_time)
+ cdata->max_time = owner_contention_time;
+ if (cdata->min_time > owner_contention_time)
+ cdata->min_time = owner_contention_time;
+ }
+ }
+
+ // No contention is going on, delete `lock` in
+ // `contention_owner_tracing` and
+ // `contention_owner_stacks`
+ if (otdata->count <= 1) {
+ bpf_map_delete_elem(&contention_owner_tracing,
+ &(pelem->lock));
+ bpf_map_delete_elem(&contention_owner_stacks,
+ &(pelem->lock));
+ }
+ // Contention is still going on, with a new owner
+ // (current task). `otdata` should be updated accordingly.
+ else {
+ (otdata->count)--;
+
+ // If ctx[1] is not 0, the current task terminates lock
+ // waiting without acquiring it. Owner is not changed.
+ if (ctx[1] == 0) {
+ u32 i = 0;
+ unsigned long *entries = bpf_map_lookup_elem(
+ &owner_stacks_entries, &i);
+ if (entries) {
+ for (i = 0; i < (u32)max_stack; i++)
+ entries[i] = 0x0;
+
+ bpf_get_task_stack(
+ bpf_get_current_task_btf(),
+ entries,
+ max_stack *
+ sizeof(unsigned long),
+ 0);
+ bpf_map_update_elem(
+ &contention_owner_stacks,
+ &(pelem->lock), entries,
+ BPF_ANY);
+ }
+
+ otdata->pid = pid;
+ otdata->timestamp = timestamp;
+ }
+
+ bpf_map_update_elem(&contention_owner_tracing,
+ &(pelem->lock), otdata, BPF_ANY);
+ }
+ }
+contention_end_skip_owner_callstack:
+
switch (aggr_mode) {
case LOCK_AGGR_CALLER:
key.stack_id = pelem->stack_id;
--
2.47.1.688.g23fc6f90ad-goog
On Sun, Jan 12, 2025 at 09:20:15PM -0800, Chun-Tse Shao wrote:
> Tracing owner callstack in `contention_begin()` and `contention_end()`,
> and storing in `owner_lock_stat` bpf map.
>
> Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> ---
> .../perf/util/bpf_skel/lock_contention.bpf.c | 152 +++++++++++++++++-
> 1 file changed, 151 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> index 05da19fdab23..3f47fbfa237c 100644
> --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> @@ -7,6 +7,7 @@
> #include <asm-generic/errno-base.h>
>
> #include "lock_data.h"
> +#include <time.h>
>
> /* for collect_lock_syms(). 4096 was rejected by the verifier */
> #define MAX_CPUS 1024
> @@ -178,6 +179,9 @@ int data_fail;
> int task_map_full;
> int data_map_full;
>
> +struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
> +void bpf_task_release(struct task_struct *p) __ksym;
To support old (ancient?) kernels, you can declare them as __weak and
check if one of them is defined and ignore owner stacks on them. Also
you can check them in user space and turn off the option before loading.
> +
> static inline __u64 get_current_cgroup_id(void)
> {
> struct task_struct *task;
> @@ -407,6 +411,60 @@ int contention_begin(u64 *ctx)
> pelem->flags = (__u32)ctx[1];
>
> if (needs_callstack) {
> + u32 i = 0;
> + int owner_pid;
> + unsigned long *entries;
> + struct task_struct *task;
> + cotd *data;
> +
> + if (!lock_owner)
> + goto contention_begin_skip_owner_callstack;
Can be it 'skip_owner'?
> +
> + task = get_lock_owner(pelem->lock, pelem->flags);
> + if (!task)
> + goto contention_begin_skip_owner_callstack;
> +
> + owner_pid = BPF_CORE_READ(task, pid);
> +
> + entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
> + if (!entries)
> + goto contention_begin_skip_owner_callstack;
> + for (i = 0; i < max_stack; i++)
> + entries[i] = 0x0;
> +
> + task = bpf_task_from_pid(owner_pid);
> + if (task) {
> + bpf_get_task_stack(task, entries,
> + max_stack * sizeof(unsigned long),
> + 0);
> + bpf_task_release(task);
> + }
> +
> + data = bpf_map_lookup_elem(&contention_owner_tracing,
> + &(pelem->lock));
No need for parenthesis.
> +
> + // Contention just happens, or corner case `lock` is owned by
> + // process not `owner_pid`.
> + if (!data || data->pid != owner_pid) {
> + cotd first = {
> + .pid = owner_pid,
> + .timestamp = pelem->timestamp,
> + .count = 1,
> + };
> + bpf_map_update_elem(&contention_owner_tracing,
> + &(pelem->lock), &first, BPF_ANY);
> + bpf_map_update_elem(&contention_owner_stacks,
> + &(pelem->lock), entries, BPF_ANY);
Hmm.. it just discard the old owner data if it comes from a new owner?
Why not save the data into the result for the old lock/callstack?
> + }
> + // Contention is going on and new waiter joins.
> + else {
> + __sync_fetch_and_add(&data->count, 1);
> + // TODO: Since for owner the callstack would change at
> + // different time, We should check and report if the
> + // callstack is different with the recorded one in
> + // `contention_owner_stacks`.
> + }
> +contention_begin_skip_owner_callstack:
> pelem->stack_id = bpf_get_stackid(ctx, &stacks,
> BPF_F_FAST_STACK_CMP | stack_skip);
> if (pelem->stack_id < 0)
> @@ -443,6 +501,7 @@ int contention_end(u64 *ctx)
> struct tstamp_data *pelem;
> struct contention_key key = {};
> struct contention_data *data;
> + __u64 timestamp;
> __u64 duration;
> bool need_delete = false;
>
> @@ -469,12 +528,103 @@ int contention_end(u64 *ctx)
> return 0;
> need_delete = true;
> }
> - duration = bpf_ktime_get_ns() - pelem->timestamp;
> + timestamp = bpf_ktime_get_ns();
> + duration = timestamp - pelem->timestamp;
> if ((__s64)duration < 0) {
> __sync_fetch_and_add(&time_fail, 1);
> goto out;
> }
>
> + if (needs_callstack && lock_owner) {
> + u64 owner_contention_time;
> + unsigned long *owner_stack;
> + struct contention_data *cdata;
> + cotd *otdata;
> +
> + otdata = bpf_map_lookup_elem(&contention_owner_tracing,
> + &(pelem->lock));
> + owner_stack = bpf_map_lookup_elem(&contention_owner_stacks,
> + &(pelem->lock));
> + if (!otdata || !owner_stack)
> + goto contention_end_skip_owner_callstack;
> +
> + owner_contention_time = timestamp - otdata->timestamp;
> +
> + // Update `owner_lock_stat` if `owner_stack` is
> + // available.
> + if (owner_stack[0] != 0x0) {
> + cdata = bpf_map_lookup_elem(&owner_lock_stat,
> + owner_stack);
> + if (!cdata) {
> + struct contention_data first = {
> + .total_time = owner_contention_time,
> + .max_time = owner_contention_time,
> + .min_time = owner_contention_time,
> + .count = 1,
> + .flags = pelem->flags,
> + };
> + bpf_map_update_elem(&owner_lock_stat,
> + owner_stack, &first,
> + BPF_ANY);
> + } else {
> + __sync_fetch_and_add(&cdata->total_time,
> + owner_contention_time);
> + __sync_fetch_and_add(&cdata->count, 1);
> +
> + /* FIXME: need atomic operations */
> + if (cdata->max_time < owner_contention_time)
> + cdata->max_time = owner_contention_time;
> + if (cdata->min_time > owner_contention_time)
> + cdata->min_time = owner_contention_time;
> + }
> + }
> +
> + // No contention is going on, delete `lock` in
> + // `contention_owner_tracing` and
> + // `contention_owner_stacks`
> + if (otdata->count <= 1) {
> + bpf_map_delete_elem(&contention_owner_tracing,
> + &(pelem->lock));
> + bpf_map_delete_elem(&contention_owner_stacks,
> + &(pelem->lock));
> + }
> + // Contention is still going on, with a new owner
> + // (current task). `otdata` should be updated accordingly.
> + else {
> + (otdata->count)--;
No need for parenthesis, and it needs to be atomic dec.
> +
> + // If ctx[1] is not 0, the current task terminates lock
> + // waiting without acquiring it. Owner is not changed.
Please add a comment that ctx[1] has the return code of the lock
function. Maybe it's better to use a local variable.
Also I think you need to say about the normal case too. Returing 0
means the waiter now gets the lock and becomes a new owner. So it needs
to update the owner information.
> + if (ctx[1] == 0) {
> + u32 i = 0;
> + unsigned long *entries = bpf_map_lookup_elem(
> + &owner_stacks_entries, &i);
> + if (entries) {
> + for (i = 0; i < (u32)max_stack; i++)
> + entries[i] = 0x0;
> +
> + bpf_get_task_stack(
> + bpf_get_current_task_btf(),
Same as bpf_get_stack(), right?
> + entries,
> + max_stack *
> + sizeof(unsigned long),
> + 0);
> + bpf_map_update_elem(
> + &contention_owner_stacks,
> + &(pelem->lock), entries,
> + BPF_ANY);
Please factor out the code if it indents too much. Or you can use goto
or something to reduce the indentation level.
if (ret != 0)
goto skip_update;
...
if (entries == NULL)
goto skip_stack;
...
Thanks,
Namhyung
> + }
> +
> + otdata->pid = pid;
> + otdata->timestamp = timestamp;
> + }
> +
> + bpf_map_update_elem(&contention_owner_tracing,
> + &(pelem->lock), otdata, BPF_ANY);
> + }
> + }
> +contention_end_skip_owner_callstack:
> +
> switch (aggr_mode) {
> case LOCK_AGGR_CALLER:
> key.stack_id = pelem->stack_id;
> --
> 2.47.1.688.g23fc6f90ad-goog
>
On Mon, Jan 13, 2025 at 7:35 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Sun, Jan 12, 2025 at 09:20:15PM -0800, Chun-Tse Shao wrote:
> > Tracing owner callstack in `contention_begin()` and `contention_end()`,
> > and storing in `owner_lock_stat` bpf map.
> >
> > Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> > ---
> > .../perf/util/bpf_skel/lock_contention.bpf.c | 152 +++++++++++++++++-
> > 1 file changed, 151 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > index 05da19fdab23..3f47fbfa237c 100644
> > --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > @@ -7,6 +7,7 @@
> > #include <asm-generic/errno-base.h>
> >
> > #include "lock_data.h"
> > +#include <time.h>
> >
> > /* for collect_lock_syms(). 4096 was rejected by the verifier */
> > #define MAX_CPUS 1024
> > @@ -178,6 +179,9 @@ int data_fail;
> > int task_map_full;
> > int data_map_full;
> >
> > +struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
> > +void bpf_task_release(struct task_struct *p) __ksym;
>
> To support old (ancient?) kernels, you can declare them as __weak and
> check if one of them is defined and ignore owner stacks on them. Also
> you can check them in user space and turn off the option before loading.
>
> > +
> > static inline __u64 get_current_cgroup_id(void)
> > {
> > struct task_struct *task;
> > @@ -407,6 +411,60 @@ int contention_begin(u64 *ctx)
> > pelem->flags = (__u32)ctx[1];
> >
> > if (needs_callstack) {
> > + u32 i = 0;
> > + int owner_pid;
> > + unsigned long *entries;
> > + struct task_struct *task;
> > + cotd *data;
> > +
> > + if (!lock_owner)
> > + goto contention_begin_skip_owner_callstack;
>
> Can be it 'skip_owner'?
>
> > +
> > + task = get_lock_owner(pelem->lock, pelem->flags);
> > + if (!task)
> > + goto contention_begin_skip_owner_callstack;
> > +
> > + owner_pid = BPF_CORE_READ(task, pid);
> > +
> > + entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
> > + if (!entries)
> > + goto contention_begin_skip_owner_callstack;
> > + for (i = 0; i < max_stack; i++)
> > + entries[i] = 0x0;
> > +
> > + task = bpf_task_from_pid(owner_pid);
> > + if (task) {
> > + bpf_get_task_stack(task, entries,
> > + max_stack * sizeof(unsigned long),
> > + 0);
> > + bpf_task_release(task);
> > + }
> > +
> > + data = bpf_map_lookup_elem(&contention_owner_tracing,
> > + &(pelem->lock));
>
> No need for parenthesis.
>
> > +
> > + // Contention just happens, or corner case `lock` is owned by
> > + // process not `owner_pid`.
> > + if (!data || data->pid != owner_pid) {
> > + cotd first = {
> > + .pid = owner_pid,
> > + .timestamp = pelem->timestamp,
> > + .count = 1,
> > + };
> > + bpf_map_update_elem(&contention_owner_tracing,
> > + &(pelem->lock), &first, BPF_ANY);
> > + bpf_map_update_elem(&contention_owner_stacks,
> > + &(pelem->lock), entries, BPF_ANY);
>
> Hmm.. it just discard the old owner data if it comes from a new owner?
> Why not save the data into the result for the old lock/callstack?
There are two conditions which enter this if statement:
1. (!data) contention just started, `&pelem->lock` entry in
`contetion_owner_tracing` is empty.
2. (data->pid != owner_pid) Some internal errors so `data->pid` is
misaligned with `owner_pid`. In this case the timestamp would be
incorrect so I prefer to drop it.
WDYT?
>
>
> > + }
> > + // Contention is going on and new waiter joins.
> > + else {
> > + __sync_fetch_and_add(&data->count, 1);
> > + // TODO: Since for owner the callstack would change at
> > + // different time, We should check and report if the
> > + // callstack is different with the recorded one in
> > + // `contention_owner_stacks`.
> > + }
> > +contention_begin_skip_owner_callstack:
> > pelem->stack_id = bpf_get_stackid(ctx, &stacks,
> > BPF_F_FAST_STACK_CMP | stack_skip);
> > if (pelem->stack_id < 0)
> > @@ -443,6 +501,7 @@ int contention_end(u64 *ctx)
> > struct tstamp_data *pelem;
> > struct contention_key key = {};
> > struct contention_data *data;
> > + __u64 timestamp;
> > __u64 duration;
> > bool need_delete = false;
> >
> > @@ -469,12 +528,103 @@ int contention_end(u64 *ctx)
> > return 0;
> > need_delete = true;
> > }
> > - duration = bpf_ktime_get_ns() - pelem->timestamp;
> > + timestamp = bpf_ktime_get_ns();
> > + duration = timestamp - pelem->timestamp;
> > if ((__s64)duration < 0) {
> > __sync_fetch_and_add(&time_fail, 1);
> > goto out;
> > }
> >
> > + if (needs_callstack && lock_owner) {
> > + u64 owner_contention_time;
> > + unsigned long *owner_stack;
> > + struct contention_data *cdata;
> > + cotd *otdata;
> > +
> > + otdata = bpf_map_lookup_elem(&contention_owner_tracing,
> > + &(pelem->lock));
> > + owner_stack = bpf_map_lookup_elem(&contention_owner_stacks,
> > + &(pelem->lock));
> > + if (!otdata || !owner_stack)
> > + goto contention_end_skip_owner_callstack;
> > +
> > + owner_contention_time = timestamp - otdata->timestamp;
> > +
> > + // Update `owner_lock_stat` if `owner_stack` is
> > + // available.
> > + if (owner_stack[0] != 0x0) {
> > + cdata = bpf_map_lookup_elem(&owner_lock_stat,
> > + owner_stack);
> > + if (!cdata) {
> > + struct contention_data first = {
> > + .total_time = owner_contention_time,
> > + .max_time = owner_contention_time,
> > + .min_time = owner_contention_time,
> > + .count = 1,
> > + .flags = pelem->flags,
> > + };
> > + bpf_map_update_elem(&owner_lock_stat,
> > + owner_stack, &first,
> > + BPF_ANY);
> > + } else {
> > + __sync_fetch_and_add(&cdata->total_time,
> > + owner_contention_time);
> > + __sync_fetch_and_add(&cdata->count, 1);
> > +
> > + /* FIXME: need atomic operations */
> > + if (cdata->max_time < owner_contention_time)
> > + cdata->max_time = owner_contention_time;
> > + if (cdata->min_time > owner_contention_time)
> > + cdata->min_time = owner_contention_time;
> > + }
> > + }
> > +
> > + // No contention is going on, delete `lock` in
> > + // `contention_owner_tracing` and
> > + // `contention_owner_stacks`
> > + if (otdata->count <= 1) {
> > + bpf_map_delete_elem(&contention_owner_tracing,
> > + &(pelem->lock));
> > + bpf_map_delete_elem(&contention_owner_stacks,
> > + &(pelem->lock));
> > + }
> > + // Contention is still going on, with a new owner
> > + // (current task). `otdata` should be updated accordingly.
> > + else {
> > + (otdata->count)--;
>
> No need for parenthesis, and it needs to be atomic dec.
>
> > +
> > + // If ctx[1] is not 0, the current task terminates lock
> > + // waiting without acquiring it. Owner is not changed.
>
> Please add a comment that ctx[1] has the return code of the lock
> function. Maybe it's better to use a local variable.
>
> Also I think you need to say about the normal case too. Returing 0
> means the waiter now gets the lock and becomes a new owner. So it needs
> to update the owner information.
>
>
> > + if (ctx[1] == 0) {
> > + u32 i = 0;
> > + unsigned long *entries = bpf_map_lookup_elem(
> > + &owner_stacks_entries, &i);
> > + if (entries) {
> > + for (i = 0; i < (u32)max_stack; i++)
> > + entries[i] = 0x0;
> > +
> > + bpf_get_task_stack(
> > + bpf_get_current_task_btf(),
>
> Same as bpf_get_stack(), right?
>
> > + entries,
> > + max_stack *
> > + sizeof(unsigned long),
> > + 0);
> > + bpf_map_update_elem(
> > + &contention_owner_stacks,
> > + &(pelem->lock), entries,
> > + BPF_ANY);
>
> Please factor out the code if it indents too much. Or you can use goto
> or something to reduce the indentation level.
I will reindent it with `ColumnLimit=100`. I was using 80 since it was
predefined in `.clang-format`, looks outdated but no one updated it..
>
> if (ret != 0)
> goto skip_update;
>
> ...
>
> if (entries == NULL)
> goto skip_stack;
>
> ...
>
> Thanks,
> Namhyung
>
> > + }
> > +
> > + otdata->pid = pid;
> > + otdata->timestamp = timestamp;
> > + }
> > +
> > + bpf_map_update_elem(&contention_owner_tracing,
> > + &(pelem->lock), otdata, BPF_ANY);
> > + }
> > + }
> > +contention_end_skip_owner_callstack:
> > +
> > switch (aggr_mode) {
> > case LOCK_AGGR_CALLER:
> > key.stack_id = pelem->stack_id;
> > --
> > 2.47.1.688.g23fc6f90ad-goog
> >
On Tue, Jan 21, 2025 at 02:35:46PM -0800, Chun-Tse Shao wrote:
> On Mon, Jan 13, 2025 at 7:35 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Sun, Jan 12, 2025 at 09:20:15PM -0800, Chun-Tse Shao wrote:
> > > Tracing owner callstack in `contention_begin()` and `contention_end()`,
> > > and storing in `owner_lock_stat` bpf map.
> > >
> > > Signed-off-by: Chun-Tse Shao <ctshao@google.com>
> > > ---
> > > .../perf/util/bpf_skel/lock_contention.bpf.c | 152 +++++++++++++++++-
> > > 1 file changed, 151 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > > index 05da19fdab23..3f47fbfa237c 100644
> > > --- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > > +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
> > > @@ -7,6 +7,7 @@
> > > #include <asm-generic/errno-base.h>
> > >
> > > #include "lock_data.h"
> > > +#include <time.h>
> > >
> > > /* for collect_lock_syms(). 4096 was rejected by the verifier */
> > > #define MAX_CPUS 1024
> > > @@ -178,6 +179,9 @@ int data_fail;
> > > int task_map_full;
> > > int data_map_full;
> > >
> > > +struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
> > > +void bpf_task_release(struct task_struct *p) __ksym;
> >
> > To support old (ancient?) kernels, you can declare them as __weak and
> > check if one of them is defined and ignore owner stacks on them. Also
> > you can check them in user space and turn off the option before loading.
> >
> > > +
> > > static inline __u64 get_current_cgroup_id(void)
> > > {
> > > struct task_struct *task;
> > > @@ -407,6 +411,60 @@ int contention_begin(u64 *ctx)
> > > pelem->flags = (__u32)ctx[1];
> > >
> > > if (needs_callstack) {
> > > + u32 i = 0;
> > > + int owner_pid;
> > > + unsigned long *entries;
> > > + struct task_struct *task;
> > > + cotd *data;
> > > +
> > > + if (!lock_owner)
> > > + goto contention_begin_skip_owner_callstack;
> >
> > Can be it 'skip_owner'?
> >
> > > +
> > > + task = get_lock_owner(pelem->lock, pelem->flags);
> > > + if (!task)
> > > + goto contention_begin_skip_owner_callstack;
> > > +
> > > + owner_pid = BPF_CORE_READ(task, pid);
> > > +
> > > + entries = bpf_map_lookup_elem(&owner_stacks_entries, &i);
> > > + if (!entries)
> > > + goto contention_begin_skip_owner_callstack;
> > > + for (i = 0; i < max_stack; i++)
> > > + entries[i] = 0x0;
> > > +
> > > + task = bpf_task_from_pid(owner_pid);
> > > + if (task) {
> > > + bpf_get_task_stack(task, entries,
> > > + max_stack * sizeof(unsigned long),
> > > + 0);
> > > + bpf_task_release(task);
> > > + }
> > > +
> > > + data = bpf_map_lookup_elem(&contention_owner_tracing,
> > > + &(pelem->lock));
> >
> > No need for parenthesis.
> >
> > > +
> > > + // Contention just happens, or corner case `lock` is owned by
> > > + // process not `owner_pid`.
> > > + if (!data || data->pid != owner_pid) {
> > > + cotd first = {
> > > + .pid = owner_pid,
> > > + .timestamp = pelem->timestamp,
> > > + .count = 1,
> > > + };
> > > + bpf_map_update_elem(&contention_owner_tracing,
> > > + &(pelem->lock), &first, BPF_ANY);
> > > + bpf_map_update_elem(&contention_owner_stacks,
> > > + &(pelem->lock), entries, BPF_ANY);
> >
> > Hmm.. it just discard the old owner data if it comes from a new owner?
> > Why not save the data into the result for the old lock/callstack?
>
> There are two conditions which enter this if statement:
> 1. (!data) contention just started, `&pelem->lock` entry in
> `contetion_owner_tracing` is empty.
> 2. (data->pid != owner_pid) Some internal errors so `data->pid` is
> misaligned with `owner_pid`. In this case the timestamp would be
> incorrect so I prefer to drop it.
> WDYT?
Ok, now I think that it should see contention_end from the earlier
waiter for the second case so it should be rare. Probably ok to drop
it for now.
>
> >
> >
> > > + }
> > > + // Contention is going on and new waiter joins.
> > > + else {
> > > + __sync_fetch_and_add(&data->count, 1);
> > > + // TODO: Since for owner the callstack would change at
> > > + // different time, We should check and report if the
> > > + // callstack is different with the recorded one in
> > > + // `contention_owner_stacks`.
> > > + }
> > > +contention_begin_skip_owner_callstack:
> > > pelem->stack_id = bpf_get_stackid(ctx, &stacks,
> > > BPF_F_FAST_STACK_CMP | stack_skip);
> > > if (pelem->stack_id < 0)
> > > @@ -443,6 +501,7 @@ int contention_end(u64 *ctx)
> > > struct tstamp_data *pelem;
> > > struct contention_key key = {};
> > > struct contention_data *data;
> > > + __u64 timestamp;
> > > __u64 duration;
> > > bool need_delete = false;
> > >
> > > @@ -469,12 +528,103 @@ int contention_end(u64 *ctx)
> > > return 0;
> > > need_delete = true;
> > > }
> > > - duration = bpf_ktime_get_ns() - pelem->timestamp;
> > > + timestamp = bpf_ktime_get_ns();
> > > + duration = timestamp - pelem->timestamp;
> > > if ((__s64)duration < 0) {
> > > __sync_fetch_and_add(&time_fail, 1);
> > > goto out;
> > > }
> > >
> > > + if (needs_callstack && lock_owner) {
> > > + u64 owner_contention_time;
> > > + unsigned long *owner_stack;
> > > + struct contention_data *cdata;
> > > + cotd *otdata;
> > > +
> > > + otdata = bpf_map_lookup_elem(&contention_owner_tracing,
> > > + &(pelem->lock));
> > > + owner_stack = bpf_map_lookup_elem(&contention_owner_stacks,
> > > + &(pelem->lock));
> > > + if (!otdata || !owner_stack)
> > > + goto contention_end_skip_owner_callstack;
> > > +
> > > + owner_contention_time = timestamp - otdata->timestamp;
> > > +
> > > + // Update `owner_lock_stat` if `owner_stack` is
> > > + // available.
> > > + if (owner_stack[0] != 0x0) {
> > > + cdata = bpf_map_lookup_elem(&owner_lock_stat,
> > > + owner_stack);
> > > + if (!cdata) {
> > > + struct contention_data first = {
> > > + .total_time = owner_contention_time,
> > > + .max_time = owner_contention_time,
> > > + .min_time = owner_contention_time,
> > > + .count = 1,
> > > + .flags = pelem->flags,
> > > + };
> > > + bpf_map_update_elem(&owner_lock_stat,
> > > + owner_stack, &first,
> > > + BPF_ANY);
> > > + } else {
> > > + __sync_fetch_and_add(&cdata->total_time,
> > > + owner_contention_time);
> > > + __sync_fetch_and_add(&cdata->count, 1);
> > > +
> > > + /* FIXME: need atomic operations */
> > > + if (cdata->max_time < owner_contention_time)
> > > + cdata->max_time = owner_contention_time;
> > > + if (cdata->min_time > owner_contention_time)
> > > + cdata->min_time = owner_contention_time;
> > > + }
> > > + }
> > > +
> > > + // No contention is going on, delete `lock` in
> > > + // `contention_owner_tracing` and
> > > + // `contention_owner_stacks`
> > > + if (otdata->count <= 1) {
> > > + bpf_map_delete_elem(&contention_owner_tracing,
> > > + &(pelem->lock));
> > > + bpf_map_delete_elem(&contention_owner_stacks,
> > > + &(pelem->lock));
> > > + }
> > > + // Contention is still going on, with a new owner
> > > + // (current task). `otdata` should be updated accordingly.
> > > + else {
> > > + (otdata->count)--;
> >
> > No need for parenthesis, and it needs to be atomic dec.
> >
> > > +
> > > + // If ctx[1] is not 0, the current task terminates lock
> > > + // waiting without acquiring it. Owner is not changed.
> >
> > Please add a comment that ctx[1] has the return code of the lock
> > function. Maybe it's better to use a local variable.
> >
> > Also I think you need to say about the normal case too. Returing 0
> > means the waiter now gets the lock and becomes a new owner. So it needs
> > to update the owner information.
> >
> >
> > > + if (ctx[1] == 0) {
> > > + u32 i = 0;
> > > + unsigned long *entries = bpf_map_lookup_elem(
> > > + &owner_stacks_entries, &i);
> > > + if (entries) {
> > > + for (i = 0; i < (u32)max_stack; i++)
> > > + entries[i] = 0x0;
> > > +
> > > + bpf_get_task_stack(
> > > + bpf_get_current_task_btf(),
> >
> > Same as bpf_get_stack(), right?
> >
> > > + entries,
> > > + max_stack *
> > > + sizeof(unsigned long),
> > > + 0);
> > > + bpf_map_update_elem(
> > > + &contention_owner_stacks,
> > > + &(pelem->lock), entries,
> > > + BPF_ANY);
> >
> > Please factor out the code if it indents too much. Or you can use goto
> > or something to reduce the indentation level.
>
> I will reindent it with `ColumnLimit=100`. I was using 80 since it was
> predefined in `.clang-format`, looks outdated but no one updated it..
Probably you can send a patch. :)
But it still holds the same, please try not to indent a lot (and user
shorter names).
Thanks,
Namhyung
>
> >
> > if (ret != 0)
> > goto skip_update;
> >
> > ...
> >
> > if (entries == NULL)
> > goto skip_stack;
> >
> > ...
> >
> > Thanks,
> > Namhyung
> >
> > > + }
> > > +
> > > + otdata->pid = pid;
> > > + otdata->timestamp = timestamp;
> > > + }
> > > +
> > > + bpf_map_update_elem(&contention_owner_tracing,
> > > + &(pelem->lock), otdata, BPF_ANY);
> > > + }
> > > + }
> > > +contention_end_skip_owner_callstack:
> > > +
> > > switch (aggr_mode) {
> > > case LOCK_AGGR_CALLER:
> > > key.stack_id = pelem->stack_id;
> > > --
> > > 2.47.1.688.g23fc6f90ad-goog
> > >
© 2016 - 2026 Red Hat, Inc.