[PATCH 4/6] perf annotate-data: Check memory access with two registers

Namhyung Kim posted 6 patches 1 year, 7 months ago
[PATCH 4/6] perf annotate-data: Check memory access with two registers
Posted by Namhyung Kim 1 year, 7 months ago
The following instruction pattern is used to access a global variable.

  mov     $0x231c0, %rax
  movslq  %edi, %rcx
  mov     -0x7dc94ae0(,%rcx,8), %rcx
  cmpl    $0x0, 0xa60(%rcx,%rax,1)     <<<--- here

The first instruction sets the address of the per-cpu variable (here, it
is 'runqueues' of type 'struct rq').  The second instruction seems to
load the cpu number used to look up the per-cpu base.  The third
instruction gets the base offset of the per-cpu area for that cpu.  The
last instruction compares the value of the per-cpu variable at the
offset of 0xa60.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate-data.c | 44 +++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index f1e52a531563..245e3ef3e2ff 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -1031,22 +1031,37 @@ static void update_insn_state_x86(struct type_state *state,
 		else if (has_reg_type(state, sreg) &&
 			 state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) {
 			u64 ip = dloc->ms->sym->start + dl->al.offset;
+			u64 var_addr = src->offset;
 			int offset;
 
+			if (src->multi_regs) {
+				int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1;
+
+				if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
+				    state->regs[reg2].kind == TSR_KIND_CONST)
+					var_addr += state->regs[reg2].imm_value;
+			}
+
 			/*
 			 * In kernel, %gs points to a per-cpu region for the
 			 * current CPU.  Access with a constant offset should
 			 * be treated as a global variable access.
 			 */
-			if (get_global_var_type(cu_die, dloc, ip, src->offset,
+			if (get_global_var_type(cu_die, dloc, ip, var_addr,
 						&offset, &type_die) &&
 			    die_get_member_type(&type_die, offset, &type_die)) {
 				tsr->type = type_die;
 				tsr->kind = TSR_KIND_TYPE;
 				tsr->ok = true;
 
-				pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
-					     insn_offset, src->offset, sreg, dst->reg1);
+				if (src->multi_regs) {
+					pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d",
+						     insn_offset, src->offset, src->reg1,
+						     src->reg2, dst->reg1);
+				} else {
+					pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
+						     insn_offset, src->offset, sreg, dst->reg1);
+				}
 				pr_debug_type_name(&tsr->type, tsr->kind);
 			} else {
 				tsr->ok = false;
@@ -1340,6 +1355,17 @@ static int check_matching_type(struct type_state *state,
 
 		pr_debug_dtp(" percpu var\n");
 
+		if (dloc->op->multi_regs) {
+			int reg2 = dloc->op->reg2;
+
+			if (dloc->op->reg2 == reg)
+				reg2 = dloc->op->reg1;
+
+			if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
+			    state->regs[reg2].kind == TSR_KIND_CONST)
+				var_addr += state->regs[reg2].imm_value;
+		}
+
 		if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr,
 					&var_offset, type_die)) {
 			dloc->type_offset = var_offset;
@@ -1527,8 +1553,16 @@ static int find_data_type_block(struct data_loc_info *dloc, int reg,
 		found = find_data_type_insn(dloc, reg, &basic_blocks, var_types,
 					    cu_die, type_die);
 		if (found > 0) {
-			pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x\n",
-				     dloc->op->offset, reg, dloc->type_offset);
+			char buf[64];
+
+			if (dloc->op->multi_regs)
+				snprintf(buf, sizeof(buf), "reg%d, reg%d",
+					 dloc->op->reg1, dloc->op->reg2);
+			else
+				snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1);
+
+			pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n",
+				     dloc->op->offset, buf, dloc->type_offset);
 			pr_debug_type_name(type_die, TSR_KIND_TYPE);
 			ret = 0;
 			break;
-- 
2.45.0.rc1.225.g2a3ae87e7f-goog
Re: [PATCH 4/6] perf annotate-data: Check memory access with two registers
Posted by Arnaldo Carvalho de Melo 1 year, 7 months ago
On Wed, May 01, 2024 at 11:00:09PM -0700, Namhyung Kim wrote:
> The following instruction pattern is used to access a global variable.
> 
>   mov     $0x231c0, %rax
>   movsql  %edi, %rcx
>   mov     -0x7dc94ae0(,%rcx,8), %rcx
>   cmpl    $0x0, 0xa60(%rcx,%rax,1)     <<<--- here
> 
> The first instruction set the address of the per-cpu variable (here, it
> is 'runqueus' of struct rq).  The second instruction seems like a cpu

You mean 'runqueues', i.e. this one:

kernel/sched/core.c
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

?

But that 0xa60 would be in an alignment hole, at least in:

$ pahole --hex rq | egrep 0xa40 -A12
	struct mm_struct *         prev_mm;              /* 0xa40   0x8 */
	unsigned int               clock_update_flags;   /* 0xa48   0x4 */

	/* XXX 4 bytes hole, try to pack */

	u64                        clock;                /* 0xa50   0x8 */

	/* XXX 40 bytes hole, try to pack */

	/* --- cacheline 42 boundary (2688 bytes) --- */
	u64                        clock_task __attribute__((__aligned__(64))); /* 0xa80   0x8 */
	u64                        clock_pelt;           /* 0xa88   0x8 */
	long unsigned int          lost_idle_time;       /* 0xa90   0x8 */
$ uname -a
Linux toolbox 6.7.11-200.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Mar 27 16:50:39 UTC 2024 x86_64 GNU/Linux
$

The paragraph then reads:

----
The first instruction set the address of the per-cpu variable (here, it
is 'runqueues' of type 'struct rq').  The second instruction seems like
a cpu number of the per-cpu base.  The third instruction get the base
offset of per-cpu area for that cpu.  The last instruction compares the
value of the per-cpu variable at the offset of 0xa60.
----

Ok?

> number of the per-cpu base.  The third instruction get the base offset
> of per-cpu area for that cpu.  The last instruction compares the value
> of the per-cpu variable at the offset of 0xa60.
> 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
>  tools/perf/util/annotate-data.c | 44 +++++++++++++++++++++++++++++----
>  1 file changed, 39 insertions(+), 5 deletions(-)
> 
> diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
> index f1e52a531563..245e3ef3e2ff 100644
> --- a/tools/perf/util/annotate-data.c
> +++ b/tools/perf/util/annotate-data.c
> @@ -1031,22 +1031,37 @@ static void update_insn_state_x86(struct type_state *state,
>  		else if (has_reg_type(state, sreg) &&
>  			 state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) {
>  			u64 ip = dloc->ms->sym->start + dl->al.offset;
> +			u64 var_addr = src->offset;
>  			int offset;
>  
> +			if (src->multi_regs) {
> +				int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1;
> +
> +				if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
> +				    state->regs[reg2].kind == TSR_KIND_CONST)
> +					var_addr += state->regs[reg2].imm_value;
> +			}
> +
>  			/*
>  			 * In kernel, %gs points to a per-cpu region for the
>  			 * current CPU.  Access with a constant offset should
>  			 * be treated as a global variable access.
>  			 */
> -			if (get_global_var_type(cu_die, dloc, ip, src->offset,
> +			if (get_global_var_type(cu_die, dloc, ip, var_addr,
>  						&offset, &type_die) &&
>  			    die_get_member_type(&type_die, offset, &type_die)) {
>  				tsr->type = type_die;
>  				tsr->kind = TSR_KIND_TYPE;
>  				tsr->ok = true;
>  
> -				pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
> -					     insn_offset, src->offset, sreg, dst->reg1);
> +				if (src->multi_regs) {
> +					pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d",
> +						     insn_offset, src->offset, src->reg1,
> +						     src->reg2, dst->reg1);
> +				} else {
> +					pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
> +						     insn_offset, src->offset, sreg, dst->reg1);
> +				}
>  				pr_debug_type_name(&tsr->type, tsr->kind);
>  			} else {
>  				tsr->ok = false;
> @@ -1340,6 +1355,17 @@ static int check_matching_type(struct type_state *state,
>  
>  		pr_debug_dtp(" percpu var\n");
>  
> +		if (dloc->op->multi_regs) {
> +			int reg2 = dloc->op->reg2;
> +
> +			if (dloc->op->reg2 == reg)
> +				reg2 = dloc->op->reg1;
> +
> +			if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
> +			    state->regs[reg2].kind == TSR_KIND_CONST)
> +				var_addr += state->regs[reg2].imm_value;
> +		}
> +
>  		if (get_global_var_type(cu_die, dloc, dloc->ip, var_addr,
>  					&var_offset, type_die)) {
>  			dloc->type_offset = var_offset;
> @@ -1527,8 +1553,16 @@ static int find_data_type_block(struct data_loc_info *dloc, int reg,
>  		found = find_data_type_insn(dloc, reg, &basic_blocks, var_types,
>  					    cu_die, type_die);
>  		if (found > 0) {
> -			pr_debug_dtp("found by insn track: %#x(reg%d) type-offset=%#x\n",
> -				     dloc->op->offset, reg, dloc->type_offset);
> +			char buf[64];
> +
> +			if (dloc->op->multi_regs)
> +				snprintf(buf, sizeof(buf), "reg%d, reg%d",
> +					 dloc->op->reg1, dloc->op->reg2);
> +			else
> +				snprintf(buf, sizeof(buf), "reg%d", dloc->op->reg1);
> +
> +			pr_debug_dtp("found by insn track: %#x(%s) type-offset=%#x\n",
> +				     dloc->op->offset, buf, dloc->type_offset);
>  			pr_debug_type_name(type_die, TSR_KIND_TYPE);
>  			ret = 0;
>  			break;
> -- 
> 2.45.0.rc1.225.g2a3ae87e7f-goog
>
Re: [PATCH 4/6] perf annotate-data: Check memory access with two registers
Posted by Namhyung Kim 1 year, 7 months ago
On Thu, May 2, 2024 at 7:05 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
>
> On Wed, May 01, 2024 at 11:00:09PM -0700, Namhyung Kim wrote:
> > The following instruction pattern is used to access a global variable.
> >
> >   mov     $0x231c0, %rax
> >   movsql  %edi, %rcx
> >   mov     -0x7dc94ae0(,%rcx,8), %rcx
> >   cmpl    $0x0, 0xa60(%rcx,%rax,1)     <<<--- here
> >
> > The first instruction set the address of the per-cpu variable (here, it
> > is 'runqueus' of struct rq).  The second instruction seems like a cpu
>
> You mean 'runqueues', i.e. this one:
>
> kernel/sched/core.c
> DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
>
> ?

Right, sorry for the typo.

>
> But that 0xa60 would be in an alignment hole, at least in:
>
> $ pahole --hex rq | egrep 0xa40 -A12
>         struct mm_struct *         prev_mm;              /* 0xa40   0x8 */
>         unsigned int               clock_update_flags;   /* 0xa48   0x4 */
>
>         /* XXX 4 bytes hole, try to pack */
>
>         u64                        clock;                /* 0xa50   0x8 */
>
>         /* XXX 40 bytes hole, try to pack */
>
>         /* --- cacheline 42 boundary (2688 bytes) --- */
>         u64                        clock_task __attribute__((__aligned__(64))); /* 0xa80   0x8 */
>         u64                        clock_pelt;           /* 0xa88   0x8 */
>         long unsigned int          lost_idle_time;       /* 0xa90   0x8 */
> $ uname -a
> Linux toolbox 6.7.11-200.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Mar 27 16:50:39 UTC 2024 x86_64 GNU/Linux
> $

This varies with the kernel version, config, and other
changes like backports or local modifications.

On my system, it was cpu_stop_work.arg.

$ pahole --hex rq | grep 0xa40 -C1
    /* --- cacheline 41 boundary (2624 bytes) --- */
    struct cpu_stop_work       active_balance_work;  /* 0xa40  0x30 */
    int                        cpu;                  /* 0xa70   0x4 */

$ pahole --hex cpu_stop_work
struct cpu_stop_work {
    struct list_head           list;                 /*     0  0x10 */
    cpu_stop_fn_t              fn;                   /*  0x10   0x8 */
    long unsigned int          caller;               /*  0x18   0x8 */
    void *                     arg;                  /*  0x20   0x8 */
    struct cpu_stop_done *     done;                 /*  0x28   0x8 */

    /* size: 48, cachelines: 1, members: 5 */
    /* last cacheline: 48 bytes */
};


>
> The paragraph then reads:
>
> ----
> The first instruction set the address of the per-cpu variable (here, it
> is 'runqueues' of type 'struct rq').  The second instruction seems like
> a cpu number of the per-cpu base.  The third instruction get the base
> offset of per-cpu area for that cpu.  The last instruction compares the
> value of the per-cpu variable at the offset of 0xa60.
> ----
>
> Ok?

Yep, looks good.

Thanks,
Namhyung
Re: [PATCH 4/6] perf annotate-data: Check memory access with two registers
Posted by Arnaldo Carvalho de Melo 1 year, 7 months ago
On Thu, May 02, 2024 at 11:14:50AM -0700, Namhyung Kim wrote:
> On Thu, May 2, 2024 at 7:05 AM Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> >
> > On Wed, May 01, 2024 at 11:00:09PM -0700, Namhyung Kim wrote:
> > > The following instruction pattern is used to access a global variable.
> > >
> > >   mov     $0x231c0, %rax
> > >   movsql  %edi, %rcx
> > >   mov     -0x7dc94ae0(,%rcx,8), %rcx
> > >   cmpl    $0x0, 0xa60(%rcx,%rax,1)     <<<--- here
> > >
> > > The first instruction set the address of the per-cpu variable (here, it
> > > is 'runqueus' of struct rq).  The second instruction seems like a cpu
> >
> > You mean 'runqueues', i.e. this one:
> >
> > kernel/sched/core.c
> > DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> >
> > ?
> 
> Right, sorry for the typo.
> 
> >
> > But that 0xa60 would be in an alignment hole, at least in:
> >
> > $ pahole --hex rq | egrep 0xa40 -A12
> >         struct mm_struct *         prev_mm;              /* 0xa40   0x8 */
> >         unsigned int               clock_update_flags;   /* 0xa48   0x4 */
> >
> >         /* XXX 4 bytes hole, try to pack */
> >
> >         u64                        clock;                /* 0xa50   0x8 */
> >
> >         /* XXX 40 bytes hole, try to pack */
> >
> >         /* --- cacheline 42 boundary (2688 bytes) --- */
> >         u64                        clock_task __attribute__((__aligned__(64))); /* 0xa80   0x8 */
> >         u64                        clock_pelt;           /* 0xa88   0x8 */
> >         long unsigned int          lost_idle_time;       /* 0xa90   0x8 */
> > $ uname -a
> > Linux toolbox 6.7.11-200.fc39.x86_64 #1 SMP PREEMPT_DYNAMIC Wed Mar 27 16:50:39 UTC 2024 x86_64 GNU/Linux
> > $
> 
> This would be different on kernel version, config and
> other changes like backports or local modifications.
> 
> On my system, it was cpu_stop_work.arg.

Sure, so please include the pahole output for the data that led you to
the conclusions in the explanation of the results obtained, so that we
can have a better mental map of all the pieces and thus get convinced of
the results and have a way to try to reproduce them on our systems.

In the future we will be grateful to this effort when looking back at
these patches :-)

Thanks for all your work in these features!

- Arnaldo
 
> $ pahole --hex rq | grep 0xa40 -C1
>     /* --- cacheline 41 boundary (2624 bytes) --- */
>     struct cpu_stop_work       active_balance_work;  /* 0xa40  0x30 */
>     int                        cpu;                  /* 0xa70   0x4 */
> 
> $ pahole --hex cpu_stop_work
> struct cpu_stop_work {
>     struct list_head           list;                 /*     0  0x10 */
>     cpu_stop_fn_t              fn;                   /*  0x10   0x8 */
>     long unsigned int          caller;               /*  0x18   0x8 */
>     void *                     arg;                  /*  0x20   0x8 */
>     struct cpu_stop_done *     done;                 /*  0x28   0x8 */
> 
>     /* size: 48, cachelines: 1, members: 5 */
>     /* last cacheline: 48 bytes */
> };
> 
> 
> >
> > The paragraph then reads:
> >
> > ----
> > The first instruction set the address of the per-cpu variable (here, it
> > is 'runqueues' of type 'struct rq').  The second instruction seems like
> > a cpu number of the per-cpu base.  The third instruction get the base
> > offset of per-cpu area for that cpu.  The last instruction compares the
> > value of the per-cpu variable at the offset of 0xa60.
> > ----
> >
> > Ok?
> 
> Yep, looks good.
> 
> Thanks,
> Namhyung