The following instruction pattern is used to access a global variable.
mov $0x231c0, %rax
movsql %edi, %rcx
mov -0x7dc94ae0(,%rcx,8), %rcx
cmpl $0x0, 0xa60(%rcx,%rax,1) <<<--- here
The first instruction sets the address of the per-cpu variable (here, it
is 'runqueues' of type 'struct rq'). The second instruction seems to
provide the cpu number for the per-cpu base. The third instruction gets
the base offset of the per-cpu area for that cpu. The last instruction
compares the value of the per-cpu variable at the offset of 0xa60.
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
tools/perf/util/annotate-data.c | 44 +++++++++++++++++++++++++++++----
1 file changed, 39 insertions(+), 5 deletions(-)
diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index f1e52a531563..245e3ef3e2ff 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -1031,22 +1031,37 @@ static void update_insn_state_x86(struct type_state *state,
else if (has_reg_type(state, sreg) &&
state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) {
u64 ip = dloc->ms->sym->start + dl->al.offset;
+ u64 var_addr = src->offset;
int offset;
+ if (src->multi_regs) {
+ int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1;
+
+ if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
+ state->regs[reg2].kind == TSR_KIND_CONST)
+ var_addr += state->regs[reg2].imm_value;
+ }
+
/*
* In kernel, %gs points to a per-cpu region for the
* current CPU. Access with a constant offset should
* be treated as a global variable access.
*/
- if (get_global_var_type(cu_die, dloc, ip, src->offset,
+ if (get_global_var_type(cu_die, dloc, ip, var_addr,
&offset, &type_die) &&
die_get_member_type(&type_die, offset, &type_die)) {
tsr->type = type_die;
tsr->kind = TSR_KIND_TYPE;
tsr->ok = true;
- pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
- insn_offset, src->offset, sreg, dst->reg1);
+ if (src->multi_regs) {
+ pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d",
+ insn_offset, src->offset, src->reg1,
+ src->reg2, dst->reg1);
+ } else {
+ pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
+ insn_offset, src->offset, sreg, dst->reg1);
+ }
pr_debug_type_name(&tsr->type, tsr->kind);
} else {
tsr->ok = false;
@@ -1340,6 +1355,17 @@ static int check_matching_type(struct type_state *state,
pr_debug_dtp(" percpu var\n");
+ if (dloc->op->multi_regs) {
+ int reg2 = dloc->op->reg2;
+
+ if (dloc->op->reg2 == reg)
+ reg2 = dloc->op->reg1;
+
+ if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
+ state->regs[reg2].kind == TSR_KIND_CONST)
+ var_addr += state->regs[reg2].imm_value;
+ }
+
if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr,
&var_offset, type_die)) {
dloc->type_offset = var_offset;
@@ -1527,8 +1553,16 @@ static int find_data_type_block(struct data_loc_info *dloc, int reg,
found = find_data_type_insn(dloc, reg, &basic_blocks, var_types,
cu_die, type_die);
if (found > 0) {
- pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x\n",
- dloc->op->offset, reg, dloc->type_offset);
+ char buf[64];
+
+ if (dloc->op->multi_regs)
+ snprintf(buf, sizeof(buf), "reg%d, reg%d",
+ dloc->op->reg1, dloc->op->reg2);
+ else
+ snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1);
+
+ pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n",
+ dloc->op->offset, buf, dloc->type_offset);
pr_debug_type_name(type_die, TSR_KIND_TYPE);
ret = 0;
break;
--
2.45.0.rc1.225.g2a3ae87e7f-goog
On Wed, May 01, 2024 at 11:00:09PM -0700, Namhyung Kim wrote:
> The following instruction pattern is used to access a global variable.
>
> mov $0x231c0, %rax
> movsql %edi, %rcx
> mov -0x7dc94ae0(,%rcx,8), %rcx
> cmpl $0x0, 0xa60(%rcx,%rax,1) <<<--- here
>
> The first instruction set the address of the per-cpu variable (here, it
> is 'runqueus' of struct rq). The second instruction seems like a cpu
You mean 'runqueues', i.e. this one:
kernel/sched/core.c
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
?
But that 0xa60 would be in an alignment hole, at least in:
$ pahole --hex rq | egrep 0xa40 -A12
struct mm_struct * prev_mm; /* 0xa40 0x8 */
unsigned int clock_update_flags; /* 0xa48 0x4 */
/* XXX 4 bytes hole, try to pack */
u64 clock; /* 0xa50 0x8 */
/* XXX 40 bytes hole, try to pack */
/* --- cacheline 42 boundary (2688 bytes) --- */
u64 clock_task __attribute__((__aligned__(64))); /* 0xa80 0x8 */
u64 clock_pelt; /* 0xa88 0x8 */
long unsigned int lost_idle_time; /* 0xa90 0x8 */
$ uname -a
Linux toolbox 6.7.11-200.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Mar 27 16:50:39 UTC 2024 x86_64 GNU/Linux
$
The paragraph then reads:
----
The first instruction sets the address of the per-cpu variable (here, it
is 'runqueues' of type 'struct rq'). The second instruction seems like
a cpu number of the per-cpu base. The third instruction gets the base
offset of the per-cpu area for that cpu. The last instruction compares the
value of the per-cpu variable at the offset of 0xa60.
----
Ok?
> number of the per-cpu base. The third instruction get the base offset
> of per-cpu area for that cpu. The last instruction compares the value
> of the per-cpu variable at the offset of 0xa60.
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> tools/perf/util/annotate-data.c | 44 +++++++++++++++++++++++++++++----
> 1 file changed, 39 insertions(+), 5 deletions(-)
>
> diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
> index f1e52a531563..245e3ef3e2ff 100644
> --- a/tools/perf/util/annotate-data.c
> +++ b/tools/perf/util/annotate-data.c
> @@ -1031,22 +1031,37 @@ static void update_insn_state_x86(struct type_state *state,
> else if (has_reg_type(state, sreg) &&
> state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) {
> u64 ip = dloc->ms->sym->start + dl->al.offset;
> + u64 var_addr = src->offset;
> int offset;
>
> + if (src->multi_regs) {
> + int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1;
> +
> + if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
> + state->regs[reg2].kind == TSR_KIND_CONST)
> + var_addr += state->regs[reg2].imm_value;
> + }
> +
> /*
> * In kernel, %gs points to a per-cpu region for the
> * current CPU. Access with a constant offset should
> * be treated as a global variable access.
> */
> - if (get_global_var_type(cu_die, dloc, ip, src->offset,
> + if (get_global_var_type(cu_die, dloc, ip, var_addr,
> &offset, &type_die) &&
> die_get_member_type(&type_die, offset, &type_die)) {
> tsr->type = type_die;
> tsr->kind = TSR_KIND_TYPE;
> tsr->ok = true;
>
> - pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
> - insn_offset, src->offset, sreg, dst->reg1);
> + if (src->multi_regs) {
> + pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d",
> + insn_offset, src->offset, src->reg1,
> + src->reg2, dst->reg1);
> + } else {
> + pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
> + insn_offset, src->offset, sreg, dst->reg1);
> + }
> pr_debug_type_name(&tsr->type, tsr->kind);
> } else {
> tsr->ok = false;
> @@ -1340,6 +1355,17 @@ static int check_matching_type(struct type_state *state,
>
> pr_debug_dtp(" percpu var\n");
>
> + if (dloc->op->multi_regs) {
> + int reg2 = dloc->op->reg2;
> +
> + if (dloc->op->reg2 == reg)
> + reg2 = dloc->op->reg1;
> +
> + if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
> + state->regs[reg2].kind == TSR_KIND_CONST)
> + var_addr += state->regs[reg2].imm_value;
> + }
> +
> if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr,
> &var_offset, type_die)) {
> dloc->type_offset = var_offset;
> @@ -1527,8 +1553,16 @@ static int find_data_type_block(struct data_loc_info *dloc, int reg,
> found = find_data_type_insn(dloc, reg, &basic_blocks, var_types,
> cu_die, type_die);
> if (found > 0) {
> - pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x\n",
> - dloc->op->offset, reg, dloc->type_offset);
> + char buf[64];
> +
> + if (dloc->op->multi_regs)
> + snprintf(buf, sizeof(buf), "reg%d, reg%d",
> + dloc->op->reg1, dloc->op->reg2);
> + else
> + snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1);
> +
> + pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n",
> + dloc->op->offset, buf, dloc->type_offset);
> pr_debug_type_name(type_die, TSR_KIND_TYPE);
> ret = 0;
> break;
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog
>
On Thu, May 2, 2024 at 7:05 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
>
> On Wed, May 01, 2024 at 11:00:09PM -0700, Namhyung Kim wrote:
> > The following instruction pattern is used to access a global variable.
> >
> > mov $0x231c0, %rax
> > movsql %edi, %rcx
> > mov -0x7dc94ae0(,%rcx,8), %rcx
> > cmpl $0x0, 0xa60(%rcx,%rax,1) <<<--- here
> >
> > The first instruction set the address of the per-cpu variable (here, it
> > is 'runqueus' of struct rq). The second instruction seems like a cpu
>
> You mean 'runqueues', i.e. this one:
>
> kernel/sched/core.c
> DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
>
> ?
Right, sorry for the typo.
>
> But that 0xa60 would be in an alignment hole, at least in:
>
> $ pahole --hex rq | egrep 0xa40 -A12
> struct mm_struct * prev_mm; /* 0xa40 0x8 */
> unsigned int clock_update_flags; /* 0xa48 0x4 */
>
> /* XXX 4 bytes hole, try to pack */
>
> u64 clock; /* 0xa50 0x8 */
>
> /* XXX 40 bytes hole, try to pack */
>
> /* --- cacheline 42 boundary (2688 bytes) --- */
> u64 clock_task __attribute__((__aligned__(64))); /* 0xa80 0x8 */
> u64 clock_pelt; /* 0xa88 0x8 */
> long unsigned int lost_idle_time; /* 0xa90 0x8 */
> $ uname -a
> Linux toolbox 6.7.11-200.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Mar 27 16:50:39 UTC 2024 x86_64 GNU/Linux
> $
This would differ depending on the kernel version, config, and
other changes such as backports or local modifications.
On my system, it was cpu_stop_work.arg.
$ pahole --hex rq | grep 0xa40 -C1
/* --- cacheline 41 boundary (2624 bytes) --- */
struct cpu_stop_work active_balance_work; /* 0xa40 0x30 */
int cpu; /* 0xa70 0x4 */
$ pahole --hex cpu_stop_work
struct cpu_stop_work {
struct list_head list; /* 0 0x10 */
cpu_stop_fn_t fn; /* 0x10 0x8 */
long unsigned int caller; /* 0x18 0x8 */
void * arg; /* 0x20 0x8 */
struct cpu_stop_done * done; /* 0x28 0x8 */
/* size: 48, cachelines: 1, members: 5 */
/* last cacheline: 48 bytes */
};
>
> The paragraph then reads:
>
> ----
> The first instruction set the address of the per-cpu variable (here, it
> is 'runqueues' of type 'struct rq'). The second instruction seems like
> a cpu number of the per-cpu base. The third instruction get the base
> offset of per-cpu area for that cpu. The last instruction compares the
> value of the per-cpu variable at the offset of 0xa60.
> ----
>
> Ok?
Yep, looks good.
Thanks,
Namhyung
On Thu, May 02, 2024 at 11:14:50AM -0700, Namhyung Kim wrote:
> On Thu, May 2, 2024 at 7:05 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> >
> > On Wed, May 01, 2024 at 11:00:09PM -0700, Namhyung Kim wrote:
> > > The following instruction pattern is used to access a global variable.
> > >
> > > mov $0x231c0, %rax
> > > movsql %edi, %rcx
> > > mov -0x7dc94ae0(,%rcx,8), %rcx
> > > cmpl $0x0, 0xa60(%rcx,%rax,1) <<<--- here
> > >
> > > The first instruction set the address of the per-cpu variable (here, it
> > > is 'runqueus' of struct rq). The second instruction seems like a cpu
> >
> > You mean 'runqueues', i.e. this one:
> >
> > kernel/sched/core.c
> > DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> >
> > ?
>
> Right, sorry for the typo.
>
> >
> > But that 0xa60 would be in an alignment hole, at least in:
> >
> > $ pahole --hex rq | egrep 0xa40 -A12
> > struct mm_struct * prev_mm; /* 0xa40 0x8 */
> > unsigned int clock_update_flags; /* 0xa48 0x4 */
> >
> > /* XXX 4 bytes hole, try to pack */
> >
> > u64 clock; /* 0xa50 0x8 */
> >
> > /* XXX 40 bytes hole, try to pack */
> >
> > /* --- cacheline 42 boundary (2688 bytes) --- */
> > u64 clock_task __attribute__((__aligned__(64))); /* 0xa80 0x8 */
> > u64 clock_pelt; /* 0xa88 0x8 */
> > long unsigned int lost_idle_time; /* 0xa90 0x8 */
> > $ uname -a
> > Linux toolbox 6.7.11-200.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Mar 27 16:50:39 UTC 2024 x86_64 GNU/Linux
> > $
>
> This would be different on kernel version, config and
> other changes like backports or local modifications.
>
> On my system, it was cpu_stop_work.arg.
Sure, so please include the pahole output for the data that led you to
the conclusions in the explanation of the results obtained, so that we
can have a better mental map of all the pieces, be convinced of the
results, and have a way to try to reproduce them on our systems.
In the future we will be grateful for this effort when looking back at
these patches :-)
Thanks for all your work in these features!
- Arnaldo
> $ pahole --hex rq | grep 0xa40 -C1
> /* --- cacheline 41 boundary (2624 bytes) --- */
> struct cpu_stop_work active_balance_work; /* 0xa40 0x30 */
> int cpu; /* 0xa70 0x4 */
>
> $ pahole --hex cpu_stop_work
> struct cpu_stop_work {
> struct list_head list; /* 0 0x10 */
> cpu_stop_fn_t fn; /* 0x10 0x8 */
> long unsigned int caller; /* 0x18 0x8 */
> void * arg; /* 0x20 0x8 */
> struct cpu_stop_done * done; /* 0x28 0x8 */
>
> /* size: 48, cachelines: 1, members: 5 */
> /* last cacheline: 48 bytes */
> };
>
>
> >
> > The paragraph then reads:
> >
> > ----
> > The first instruction set the address of the per-cpu variable (here, it
> > is 'runqueues' of type 'struct rq'). The second instruction seems like
> > a cpu number of the per-cpu base. The third instruction get the base
> > offset of per-cpu area for that cpu. The last instruction compares the
> > value of the per-cpu variable at the offset of 0xa60.
> > ----
> >
> > Ok?
>
> Yep, looks good.
>
> Thanks,
> Namhyung
© 2016 - 2025 Red Hat, Inc.