[PATCH v7 4/6] perf record: Track sideband events for all CPUs when tracing selected CPUs

Yang Jihong posted 6 patches 2 years, 3 months ago
There is a newer version of this series
[PATCH v7 4/6] perf record: Track sideband events for all CPUs when tracing selected CPUs
Posted by Yang Jihong 2 years, 3 months ago
User space tasks can migrate between CPUs, we need to track side-band
events for all CPUs.

The specific scenarios are as follows:

         CPU0                                 CPU1
  perf record -C 0 start
                              taskA starts to be created and executed
                                -> PERF_RECORD_COMM and PERF_RECORD_MMAP
                                   events only deliver to CPU1
                              ......
                                |
                          migrate to CPU0
                                |
  Running on CPU0    <----------/
  ...

  perf record -C 0 stop

Now perf samples the PC of taskA. However, perf does not record the
PERF_RECORD_COMM and PERF_RECORD_MMAP events of taskA.
Therefore, the comm and symbols of taskA cannot be parsed.

The solution is to record sideband events for all CPUs when tracing
selected CPUs. Because this modifies the default behavior, add related
comments to the perf record man page.

The sys_perf_event_open invoked is as follows:

  # perf --debug verbose=3 record -e cpu-clock -C 1 true
  <SNIP>
  Opening: cpu-clock
  ------------------------------------------------------------
  perf_event_attr:
    type                             1 (PERF_TYPE_SOFTWARE)
    size                             136
    config                           0 (PERF_COUNT_SW_CPU_CLOCK)
    { sample_period, sample_freq }   4000
    sample_type                      IP|TID|TIME|CPU|PERIOD|IDENTIFIER
    read_format                      ID|LOST
    disabled                         1
    inherit                          1
    freq                             1
    sample_id_all                    1
    exclude_guest                    1
  ------------------------------------------------------------
  sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 5
  Opening: dummy:u
  ------------------------------------------------------------
  perf_event_attr:
    type                             1 (PERF_TYPE_SOFTWARE)
    size                             136
    config                           0x9 (PERF_COUNT_SW_DUMMY)
    { sample_period, sample_freq }   1
    sample_type                      IP|TID|TIME|CPU|IDENTIFIER
    read_format                      ID|LOST
    inherit                          1
    exclude_kernel                   1
    exclude_hv                       1
    mmap                             1
    comm                             1
    task                             1
    sample_id_all                    1
    exclude_guest                    1
    mmap2                            1
    comm_exec                        1
    ksymbol                          1
    bpf_event                        1
  ------------------------------------------------------------
  sys_perf_event_open: pid -1  cpu 0  group_fd -1  flags 0x8 = 6
  sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 7
  sys_perf_event_open: pid -1  cpu 2  group_fd -1  flags 0x8 = 9
  sys_perf_event_open: pid -1  cpu 3  group_fd -1  flags 0x8 = 10
  sys_perf_event_open: pid -1  cpu 4  group_fd -1  flags 0x8 = 11
  sys_perf_event_open: pid -1  cpu 5  group_fd -1  flags 0x8 = 12
  sys_perf_event_open: pid -1  cpu 6  group_fd -1  flags 0x8 = 13
  sys_perf_event_open: pid -1  cpu 7  group_fd -1  flags 0x8 = 14
  <SNIP>

Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
---
 tools/perf/Documentation/perf-record.txt |  3 ++
 tools/perf/builtin-record.c              | 44 +++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index d5217be012d7..1889f66addf2 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -374,6 +374,9 @@ comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-
 In per-thread mode with inheritance mode on (default), samples are captured only when
 the thread executes on the designated CPUs. Default is to monitor all CPUs.
 
+User space tasks can migrate between CPUs, so when tracing selected CPUs,
+a dummy event is created to track sideband for all CPUs.
+
 -B::
 --no-buildid::
 Do not save the build ids of binaries in the perf.data files. This skips
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 83bd1f117191..21c571018148 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -906,10 +906,44 @@ static int record__config_off_cpu(struct record *rec)
 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
 }
 
+static bool record__tracking_system_wide(struct record *rec)
+{
+	struct record_opts *opts = &rec->opts;
+	struct evlist *evlist = rec->evlist;
+	struct evsel *evsel;
+
+	/*
+	 * If all (non-dummy) evsel have exclude_user,
+	 * system_wide is not needed.
+	 *
+	 * all_kernel and all_user will overwrite exclude_kernel and
+	 * exclude_user of attr in evsel__config(), here need to check
+	 * all the three items.
+	 *
+	 * Sideband system wide if one of the following conditions is met:
+	 *
+	 *   - all_user is set, and there is a non-dummy event
+	 *   - all_user and all_kernel are not set, and there is
+	 *     a non-dummy event without exclude_user
+	 */
+	if (opts->all_kernel)
+		return false;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (!evsel__is_dummy_event(evsel)) {
+			if (opts->all_user || !evsel->core.attr.exclude_user)
+				return true;
+		}
+	}
+
+	return false;
+}
+
 static int record__config_tracking_events(struct record *rec)
 {
 	struct record_opts *opts = &rec->opts;
 	struct evlist *evlist = rec->evlist;
+	bool system_wide = false;
 	struct evsel *evsel;
 
 	/*
@@ -919,7 +953,15 @@ static int record__config_tracking_events(struct record *rec)
 	 */
 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
 	    perf_pmus__num_core_pmus() > 1) {
-		evsel = evlist__findnew_tracking_event(evlist, false);
+
+		/*
+		 * User space tasks can migrate between CPUs, so when tracing
+		 * selected CPUs, sideband for all CPUs is still needed.
+		 */
+		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
+			system_wide = true;
+
+		evsel = evlist__findnew_tracking_event(evlist, system_wide);
 		if (!evsel)
 			return -ENOMEM;
 
-- 
2.30.GIT
Re: [PATCH v7 4/6] perf record: Track sideband events for all CPUs when tracing selected CPUs
Posted by Namhyung Kim 2 years, 3 months ago
Hello,

On Fri, Aug 25, 2023 at 8:29 PM Yang Jihong <yangjihong1@huawei.com> wrote:
>
> User space tasks can migrate between CPUs, we need to track side-band
> events for all CPUs.
>
> The specific scenarios are as follows:
>
>          CPU0                                 CPU1
>   perf record -C 0 start
>                               taskA starts to be created and executed
>                                 -> PERF_RECORD_COMM and PERF_RECORD_MMAP
>                                    events only deliver to CPU1
>                               ......
>                                 |
>                           migrate to CPU0
>                                 |
>   Running on CPU0    <----------/
>   ...
>
>   perf record -C 0 stop
>
> Now perf samples the PC of taskA. However, perf does not record the
> PERF_RECORD_COMM and PERF_RECORD_MMAP events of taskA.
> Therefore, the comm and symbols of taskA cannot be parsed.
>
> The solution is to record sideband events for all CPUs when tracing
> selected CPUs. Because this modifies the default behavior, add related
> comments to the perf record man page.
>
> The sys_perf_event_open invoked is as follows:
>
>   # perf --debug verbose=3 record -e cpu-clock -C 1 true
>   <SNIP>
>   Opening: cpu-clock
>   ------------------------------------------------------------
>   perf_event_attr:
>     type                             1 (PERF_TYPE_SOFTWARE)
>     size                             136
>     config                           0 (PERF_COUNT_SW_CPU_CLOCK)
>     { sample_period, sample_freq }   4000
>     sample_type                      IP|TID|TIME|CPU|PERIOD|IDENTIFIER
>     read_format                      ID|LOST
>     disabled                         1
>     inherit                          1
>     freq                             1
>     sample_id_all                    1
>     exclude_guest                    1
>   ------------------------------------------------------------
>   sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 5
>   Opening: dummy:u
>   ------------------------------------------------------------
>   perf_event_attr:
>     type                             1 (PERF_TYPE_SOFTWARE)
>     size                             136
>     config                           0x9 (PERF_COUNT_SW_DUMMY)
>     { sample_period, sample_freq }   1
>     sample_type                      IP|TID|TIME|CPU|IDENTIFIER
>     read_format                      ID|LOST
>     inherit                          1
>     exclude_kernel                   1
>     exclude_hv                       1
>     mmap                             1
>     comm                             1
>     task                             1
>     sample_id_all                    1
>     exclude_guest                    1
>     mmap2                            1
>     comm_exec                        1
>     ksymbol                          1
>     bpf_event                        1
>   ------------------------------------------------------------
>   sys_perf_event_open: pid -1  cpu 0  group_fd -1  flags 0x8 = 6
>   sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 7
>   sys_perf_event_open: pid -1  cpu 2  group_fd -1  flags 0x8 = 9
>   sys_perf_event_open: pid -1  cpu 3  group_fd -1  flags 0x8 = 10
>   sys_perf_event_open: pid -1  cpu 4  group_fd -1  flags 0x8 = 11
>   sys_perf_event_open: pid -1  cpu 5  group_fd -1  flags 0x8 = 12
>   sys_perf_event_open: pid -1  cpu 6  group_fd -1  flags 0x8 = 13
>   sys_perf_event_open: pid -1  cpu 7  group_fd -1  flags 0x8 = 14
>   <SNIP>
>
> Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
> Acked-by: Adrian Hunter <adrian.hunter@intel.com>
> ---
>  tools/perf/Documentation/perf-record.txt |  3 ++
>  tools/perf/builtin-record.c              | 44 +++++++++++++++++++++++-
>  2 files changed, 46 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
> index d5217be012d7..1889f66addf2 100644
> --- a/tools/perf/Documentation/perf-record.txt
> +++ b/tools/perf/Documentation/perf-record.txt
> @@ -374,6 +374,9 @@ comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-
>  In per-thread mode with inheritance mode on (default), samples are captured only when
>  the thread executes on the designated CPUs. Default is to monitor all CPUs.
>
> +User space tasks can migrate between CPUs, so when tracing selected CPUs,
> +a dummy event is created to track sideband for all CPUs.
> +
>  -B::
>  --no-buildid::
>  Do not save the build ids of binaries in the perf.data files. This skips
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 83bd1f117191..21c571018148 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -906,10 +906,44 @@ static int record__config_off_cpu(struct record *rec)
>         return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
>  }
>
> +static bool record__tracking_system_wide(struct record *rec)
> +{
> +       struct record_opts *opts = &rec->opts;
> +       struct evlist *evlist = rec->evlist;
> +       struct evsel *evsel;
> +
> +       /*
> +        * If all (non-dummy) evsel have exclude_user,
> +        * system_wide is not needed.

Maybe I missed some earlier discussion but why is it not
needed when exclude_user is set?  I think it still needs
FORK or COMM at least..

Thanks,
Namhyung


> +        *
> +        * all_kernel and all_user will overwrite exclude_kernel and
> +        * exclude_user of attr in evsel__config(), here need to check
> +        * all the three items.
> +        *
> +        * Sideband system wide if one of the following conditions is met:
> +        *
> +        *   - all_user is set, and there is a non-dummy event
> +        *   - all_user and all_kernel are not set, and there is
> +        *     a non-dummy event without exclude_user
> +        */
> +       if (opts->all_kernel)
> +               return false;
> +
> +       evlist__for_each_entry(evlist, evsel) {
> +               if (!evsel__is_dummy_event(evsel)) {
> +                       if (opts->all_user || !evsel->core.attr.exclude_user)
> +                               return true;
> +               }
> +       }
> +
> +       return false;
> +}
> +
>  static int record__config_tracking_events(struct record *rec)
>  {
>         struct record_opts *opts = &rec->opts;
>         struct evlist *evlist = rec->evlist;
> +       bool system_wide = false;
>         struct evsel *evsel;
>
>         /*
> @@ -919,7 +953,15 @@ static int record__config_tracking_events(struct record *rec)
>          */
>         if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
>             perf_pmus__num_core_pmus() > 1) {
> -               evsel = evlist__findnew_tracking_event(evlist, false);
> +
> +               /*
> +                * User space tasks can migrate between CPUs, so when tracing
> +                * selected CPUs, sideband for all CPUs is still needed.
> +                */
> +               if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
> +                       system_wide = true;
> +
> +               evsel = evlist__findnew_tracking_event(evlist, system_wide);
>                 if (!evsel)
>                         return -ENOMEM;
>
> --
> 2.30.GIT
>
Re: [PATCH v7 4/6] perf record: Track sideband events for all CPUs when tracing selected CPUs
Posted by Yang Jihong 2 years, 3 months ago
Hello,

On 2023/8/26 22:59, Namhyung Kim wrote:
> Hello,
> 
> On Fri, Aug 25, 2023 at 8:29 PM Yang Jihong <yangjihong1@huawei.com> wrote:
>>
>> User space tasks can migrate between CPUs, we need to track side-band
>> events for all CPUs.
>>
>> The specific scenarios are as follows:
>>
>>           CPU0                                 CPU1
>>    perf record -C 0 start
>>                                taskA starts to be created and executed
>>                                  -> PERF_RECORD_COMM and PERF_RECORD_MMAP
>>                                     events only deliver to CPU1
>>                                ......
>>                                  |
>>                            migrate to CPU0
>>                                  |
>>    Running on CPU0    <----------/
>>    ...
>>
>>    perf record -C 0 stop
>>
>> Now perf samples the PC of taskA. However, perf does not record the
>> PERF_RECORD_COMM and PERF_RECORD_MMAP events of taskA.
>> Therefore, the comm and symbols of taskA cannot be parsed.
>>
>> The solution is to record sideband events for all CPUs when tracing
>> selected CPUs. Because this modifies the default behavior, add related
>> comments to the perf record man page.
>>
>> The sys_perf_event_open invoked is as follows:
>>
>>    # perf --debug verbose=3 record -e cpu-clock -C 1 true
>>    <SNIP>
>>    Opening: cpu-clock
>>    ------------------------------------------------------------
>>    perf_event_attr:
>>      type                             1 (PERF_TYPE_SOFTWARE)
>>      size                             136
>>      config                           0 (PERF_COUNT_SW_CPU_CLOCK)
>>      { sample_period, sample_freq }   4000
>>      sample_type                      IP|TID|TIME|CPU|PERIOD|IDENTIFIER
>>      read_format                      ID|LOST
>>      disabled                         1
>>      inherit                          1
>>      freq                             1
>>      sample_id_all                    1
>>      exclude_guest                    1
>>    ------------------------------------------------------------
>>    sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 5
>>    Opening: dummy:u
>>    ------------------------------------------------------------
>>    perf_event_attr:
>>      type                             1 (PERF_TYPE_SOFTWARE)
>>      size                             136
>>      config                           0x9 (PERF_COUNT_SW_DUMMY)
>>      { sample_period, sample_freq }   1
>>      sample_type                      IP|TID|TIME|CPU|IDENTIFIER
>>      read_format                      ID|LOST
>>      inherit                          1
>>      exclude_kernel                   1
>>      exclude_hv                       1
>>      mmap                             1
>>      comm                             1
>>      task                             1
>>      sample_id_all                    1
>>      exclude_guest                    1
>>      mmap2                            1
>>      comm_exec                        1
>>      ksymbol                          1
>>      bpf_event                        1
>>    ------------------------------------------------------------
>>    sys_perf_event_open: pid -1  cpu 0  group_fd -1  flags 0x8 = 6
>>    sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 7
>>    sys_perf_event_open: pid -1  cpu 2  group_fd -1  flags 0x8 = 9
>>    sys_perf_event_open: pid -1  cpu 3  group_fd -1  flags 0x8 = 10
>>    sys_perf_event_open: pid -1  cpu 4  group_fd -1  flags 0x8 = 11
>>    sys_perf_event_open: pid -1  cpu 5  group_fd -1  flags 0x8 = 12
>>    sys_perf_event_open: pid -1  cpu 6  group_fd -1  flags 0x8 = 13
>>    sys_perf_event_open: pid -1  cpu 7  group_fd -1  flags 0x8 = 14
>>    <SNIP>
>>
>> Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
>> Acked-by: Adrian Hunter <adrian.hunter@intel.com>
>> ---
>>   tools/perf/Documentation/perf-record.txt |  3 ++
>>   tools/perf/builtin-record.c              | 44 +++++++++++++++++++++++-
>>   2 files changed, 46 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
>> index d5217be012d7..1889f66addf2 100644
>> --- a/tools/perf/Documentation/perf-record.txt
>> +++ b/tools/perf/Documentation/perf-record.txt
>> @@ -374,6 +374,9 @@ comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-
>>   In per-thread mode with inheritance mode on (default), samples are captured only when
>>   the thread executes on the designated CPUs. Default is to monitor all CPUs.
>>
>> +User space tasks can migrate between CPUs, so when tracing selected CPUs,
>> +a dummy event is created to track sideband for all CPUs.
>> +
>>   -B::
>>   --no-buildid::
>>   Do not save the build ids of binaries in the perf.data files. This skips
>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>> index 83bd1f117191..21c571018148 100644
>> --- a/tools/perf/builtin-record.c
>> +++ b/tools/perf/builtin-record.c
>> @@ -906,10 +906,44 @@ static int record__config_off_cpu(struct record *rec)
>>          return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
>>   }
>>
>> +static bool record__tracking_system_wide(struct record *rec)
>> +{
>> +       struct record_opts *opts = &rec->opts;
>> +       struct evlist *evlist = rec->evlist;
>> +       struct evsel *evsel;
>> +
>> +       /*
>> +        * If all (non-dummy) evsel have exclude_user,
>> +        * system_wide is not needed.
> 
> Maybe I missed some earlier discussion but why is it not
> needed when exclude_user is set?  I think it still needs
> FORK or COMM at least..
> 

This is Adrian's suggestion earlier, I think it's probably because if 
exclude_user is set, MMAP information is not needed, that's my guess.

However, as you said, even if exclude_user is set, at least FORK and 
COMM are required.

Therefore, the conditions here need to be changed to:
"system_wide is need as long as there is a non-dummy event."

@Adrian, is this change okay?

Thanks,
Yang
Re: [PATCH v7 4/6] perf record: Track sideband events for all CPUs when tracing selected CPUs
Posted by Adrian Hunter 2 years, 3 months ago
On 28/08/23 06:03, Yang Jihong wrote:
> Hello,
> 
> On 2023/8/26 22:59, Namhyung Kim wrote:
>> Hello,
>>
>> On Fri, Aug 25, 2023 at 8:29 PM Yang Jihong <yangjihong1@huawei.com> wrote:
>>>
>>> User space tasks can migrate between CPUs, we need to track side-band
>>> events for all CPUs.
>>>
>>> The specific scenarios are as follows:
>>>
>>>           CPU0                                 CPU1
>>>    perf record -C 0 start
>>>                                taskA starts to be created and executed
>>>                                  -> PERF_RECORD_COMM and PERF_RECORD_MMAP
>>>                                     events only deliver to CPU1
>>>                                ......
>>>                                  |
>>>                            migrate to CPU0
>>>                                  |
>>>    Running on CPU0    <----------/
>>>    ...
>>>
>>>    perf record -C 0 stop
>>>
>>> Now perf samples the PC of taskA. However, perf does not record the
>>> PERF_RECORD_COMM and PERF_RECORD_MMAP events of taskA.
>>> Therefore, the comm and symbols of taskA cannot be parsed.
>>>
>>> The solution is to record sideband events for all CPUs when tracing
>>> selected CPUs. Because this modifies the default behavior, add related
>>> comments to the perf record man page.
>>>
>>> The sys_perf_event_open invoked is as follows:
>>>
>>>    # perf --debug verbose=3 record -e cpu-clock -C 1 true
>>>    <SNIP>
>>>    Opening: cpu-clock
>>>    ------------------------------------------------------------
>>>    perf_event_attr:
>>>      type                             1 (PERF_TYPE_SOFTWARE)
>>>      size                             136
>>>      config                           0 (PERF_COUNT_SW_CPU_CLOCK)
>>>      { sample_period, sample_freq }   4000
>>>      sample_type                      IP|TID|TIME|CPU|PERIOD|IDENTIFIER
>>>      read_format                      ID|LOST
>>>      disabled                         1
>>>      inherit                          1
>>>      freq                             1
>>>      sample_id_all                    1
>>>      exclude_guest                    1
>>>    ------------------------------------------------------------
>>>    sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 5
>>>    Opening: dummy:u
>>>    ------------------------------------------------------------
>>>    perf_event_attr:
>>>      type                             1 (PERF_TYPE_SOFTWARE)
>>>      size                             136
>>>      config                           0x9 (PERF_COUNT_SW_DUMMY)
>>>      { sample_period, sample_freq }   1
>>>      sample_type                      IP|TID|TIME|CPU|IDENTIFIER
>>>      read_format                      ID|LOST
>>>      inherit                          1
>>>      exclude_kernel                   1
>>>      exclude_hv                       1
>>>      mmap                             1
>>>      comm                             1
>>>      task                             1
>>>      sample_id_all                    1
>>>      exclude_guest                    1
>>>      mmap2                            1
>>>      comm_exec                        1
>>>      ksymbol                          1
>>>      bpf_event                        1
>>>    ------------------------------------------------------------
>>>    sys_perf_event_open: pid -1  cpu 0  group_fd -1  flags 0x8 = 6
>>>    sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 7
>>>    sys_perf_event_open: pid -1  cpu 2  group_fd -1  flags 0x8 = 9
>>>    sys_perf_event_open: pid -1  cpu 3  group_fd -1  flags 0x8 = 10
>>>    sys_perf_event_open: pid -1  cpu 4  group_fd -1  flags 0x8 = 11
>>>    sys_perf_event_open: pid -1  cpu 5  group_fd -1  flags 0x8 = 12
>>>    sys_perf_event_open: pid -1  cpu 6  group_fd -1  flags 0x8 = 13
>>>    sys_perf_event_open: pid -1  cpu 7  group_fd -1  flags 0x8 = 14
>>>    <SNIP>
>>>
>>> Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
>>> Acked-by: Adrian Hunter <adrian.hunter@intel.com>
>>> ---
>>>   tools/perf/Documentation/perf-record.txt |  3 ++
>>>   tools/perf/builtin-record.c              | 44 +++++++++++++++++++++++-
>>>   2 files changed, 46 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
>>> index d5217be012d7..1889f66addf2 100644
>>> --- a/tools/perf/Documentation/perf-record.txt
>>> +++ b/tools/perf/Documentation/perf-record.txt
>>> @@ -374,6 +374,9 @@ comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-
>>>   In per-thread mode with inheritance mode on (default), samples are captured only when
>>>   the thread executes on the designated CPUs. Default is to monitor all CPUs.
>>>
>>> +User space tasks can migrate between CPUs, so when tracing selected CPUs,
>>> +a dummy event is created to track sideband for all CPUs.
>>> +
>>>   -B::
>>>   --no-buildid::
>>>   Do not save the build ids of binaries in the perf.data files. This skips
>>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>>> index 83bd1f117191..21c571018148 100644
>>> --- a/tools/perf/builtin-record.c
>>> +++ b/tools/perf/builtin-record.c
>>> @@ -906,10 +906,44 @@ static int record__config_off_cpu(struct record *rec)
>>>          return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
>>>   }
>>>
>>> +static bool record__tracking_system_wide(struct record *rec)
>>> +{
>>> +       struct record_opts *opts = &rec->opts;
>>> +       struct evlist *evlist = rec->evlist;
>>> +       struct evsel *evsel;
>>> +
>>> +       /*
>>> +        * If all (non-dummy) evsel have exclude_user,
>>> +        * system_wide is not needed.
>>
>> Maybe I missed some earlier discussion but why is it not
>> needed when exclude_user is set?  I think it still needs
>> FORK or COMM at least..
>>
> 
> This is Adrian's suggestion earlier, I think it's probably because if exclude_user is set, MMAP information is not needed, that's my guess.
> 
> However, as you said, even if exclude_user is set, at least FORK and COMM are required.
> 
> Therefore, the conditions here need to be changed to:
> "system_wide is need as long as there is a non-dummy event."
> 
> @Adrian, is this change okay?

If you wish.  I think we use FORK to get mappings, so not sure it
would be needed.  There is PID, TID so COMM is not essential.
There are FORK, COMM etc from the CPUs being traced of course.

Re: [PATCH v7 4/6] perf record: Track sideband events for all CPUs when tracing selected CPUs
Posted by Yang Jihong 2 years, 3 months ago
Hello,

On 2023/8/28 19:25, Adrian Hunter wrote:
> On 28/08/23 06:03, Yang Jihong wrote:
>> Hello,
>>
>> On 2023/8/26 22:59, Namhyung Kim wrote:
>>> Hello,
>>>
>>> On Fri, Aug 25, 2023 at 8:29 PM Yang Jihong <yangjihong1@huawei.com> wrote:
>>>>
>>>> User space tasks can migrate between CPUs, we need to track side-band
>>>> events for all CPUs.
>>>>
>>>> The specific scenarios are as follows:
>>>>
>>>>            CPU0                                 CPU1
>>>>     perf record -C 0 start
>>>>                                 taskA starts to be created and executed
>>>>                                   -> PERF_RECORD_COMM and PERF_RECORD_MMAP
>>>>                                      events only deliver to CPU1
>>>>                                 ......
>>>>                                   |
>>>>                             migrate to CPU0
>>>>                                   |
>>>>     Running on CPU0    <----------/
>>>>     ...
>>>>
>>>>     perf record -C 0 stop
>>>>
>>>> Now perf samples the PC of taskA. However, perf does not record the
>>>> PERF_RECORD_COMM and PERF_RECORD_MMAP events of taskA.
>>>> Therefore, the comm and symbols of taskA cannot be parsed.
>>>>
>>>> The solution is to record sideband events for all CPUs when tracing
>>>> selected CPUs. Because this modifies the default behavior, add related
>>>> comments to the perf record man page.
>>>>
>>>> The sys_perf_event_open invoked is as follows:
>>>>
>>>>     # perf --debug verbose=3 record -e cpu-clock -C 1 true
>>>>     <SNIP>
>>>>     Opening: cpu-clock
>>>>     ------------------------------------------------------------
>>>>     perf_event_attr:
>>>>       type                             1 (PERF_TYPE_SOFTWARE)
>>>>       size                             136
>>>>       config                           0 (PERF_COUNT_SW_CPU_CLOCK)
>>>>       { sample_period, sample_freq }   4000
>>>>       sample_type                      IP|TID|TIME|CPU|PERIOD|IDENTIFIER
>>>>       read_format                      ID|LOST
>>>>       disabled                         1
>>>>       inherit                          1
>>>>       freq                             1
>>>>       sample_id_all                    1
>>>>       exclude_guest                    1
>>>>     ------------------------------------------------------------
>>>>     sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 5
>>>>     Opening: dummy:u
>>>>     ------------------------------------------------------------
>>>>     perf_event_attr:
>>>>       type                             1 (PERF_TYPE_SOFTWARE)
>>>>       size                             136
>>>>       config                           0x9 (PERF_COUNT_SW_DUMMY)
>>>>       { sample_period, sample_freq }   1
>>>>       sample_type                      IP|TID|TIME|CPU|IDENTIFIER
>>>>       read_format                      ID|LOST
>>>>       inherit                          1
>>>>       exclude_kernel                   1
>>>>       exclude_hv                       1
>>>>       mmap                             1
>>>>       comm                             1
>>>>       task                             1
>>>>       sample_id_all                    1
>>>>       exclude_guest                    1
>>>>       mmap2                            1
>>>>       comm_exec                        1
>>>>       ksymbol                          1
>>>>       bpf_event                        1
>>>>     ------------------------------------------------------------
>>>>     sys_perf_event_open: pid -1  cpu 0  group_fd -1  flags 0x8 = 6
>>>>     sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8 = 7
>>>>     sys_perf_event_open: pid -1  cpu 2  group_fd -1  flags 0x8 = 9
>>>>     sys_perf_event_open: pid -1  cpu 3  group_fd -1  flags 0x8 = 10
>>>>     sys_perf_event_open: pid -1  cpu 4  group_fd -1  flags 0x8 = 11
>>>>     sys_perf_event_open: pid -1  cpu 5  group_fd -1  flags 0x8 = 12
>>>>     sys_perf_event_open: pid -1  cpu 6  group_fd -1  flags 0x8 = 13
>>>>     sys_perf_event_open: pid -1  cpu 7  group_fd -1  flags 0x8 = 14
>>>>     <SNIP>
>>>>
>>>> Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
>>>> Acked-by: Adrian Hunter <adrian.hunter@intel.com>
>>>> ---
>>>>    tools/perf/Documentation/perf-record.txt |  3 ++
>>>>    tools/perf/builtin-record.c              | 44 +++++++++++++++++++++++-
>>>>    2 files changed, 46 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
>>>> index d5217be012d7..1889f66addf2 100644
>>>> --- a/tools/perf/Documentation/perf-record.txt
>>>> +++ b/tools/perf/Documentation/perf-record.txt
>>>> @@ -374,6 +374,9 @@ comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-
>>>>    In per-thread mode with inheritance mode on (default), samples are captured only when
>>>>    the thread executes on the designated CPUs. Default is to monitor all CPUs.
>>>>
>>>> +User space tasks can migrate between CPUs, so when tracing selected CPUs,
>>>> +a dummy event is created to track sideband for all CPUs.
>>>> +
>>>>    -B::
>>>>    --no-buildid::
>>>>    Do not save the build ids of binaries in the perf.data files. This skips
>>>> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
>>>> index 83bd1f117191..21c571018148 100644
>>>> --- a/tools/perf/builtin-record.c
>>>> +++ b/tools/perf/builtin-record.c
>>>> @@ -906,10 +906,44 @@ static int record__config_off_cpu(struct record *rec)
>>>>           return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
>>>>    }
>>>>
>>>> +static bool record__tracking_system_wide(struct record *rec)
>>>> +{
>>>> +       struct record_opts *opts = &rec->opts;
>>>> +       struct evlist *evlist = rec->evlist;
>>>> +       struct evsel *evsel;
>>>> +
>>>> +       /*
>>>> +        * If all (non-dummy) evsel have exclude_user,
>>>> +        * system_wide is not needed.
>>>
>>> Maybe I missed some earlier discussion but why is it not
>>> needed when exclude_user is set?  I think it still needs
>>> FORK or COMM at least..
>>>
>>
>> This is Adrian's suggestion earlier, I think it's probably because if exclude_user is set, MMAP information is not needed, that's my guess.
>>
>> However, as you said, even if exclude_user is set, at least FORK and COMM are required.
>>
>> Therefore, the conditions here need to be changed to:
>> "system_wide is need as long as there is a non-dummy event."
>>
>> @Adrian, is this change okay?
> 
> If you wish.  I think we use FORK to get mappings, so not sure it
> would be needed.  There is PID, TID so COMM is not essential.
> There are FORK, COMM etc from the CPUs being traced of course.
> 
I think COMM is also necessary. If there is no COMM, the perf report 
cannot display the comm of the process migrated from other core.
The test result is as follows:

   # perf report --stdio
   # To display the perf.data header info, please use 
--header/--header-only options.
   #
   #
   # Total Lost Samples: 0
   #
   # Samples: 33  of event 'cpu-clock:khppp'
   # Event count (approx.): 8250000
   #
   # Overhead  Command          Shared Object      Symbol
   # ........  ...............  ................. 
.......................................
   #
       15.15%  test3            [kernel.kallsyms]  [k] finish_task_switch
        9.09%  kworker/1:0-eve  [kernel.kallsyms]  [k] _raw_spin_unlock_irq
        3.03%  :934             [kernel.kallsyms]  [k] blk_done_softirq
        3.03%  :934             [kernel.kallsyms]  [k] finish_task_switch
   <SNIP>

Thanks,
Yang