tools/perf/Documentation/perf-amd-ibs.txt | 9 +
tools/perf/Documentation/perf-c2c.txt | 11 +-
tools/perf/Documentation/perf-mem.txt | 13 +-
tools/perf/arch/x86/include/arch-tests.h | 1 +
tools/perf/arch/x86/tests/Build | 1 +
tools/perf/arch/x86/tests/amd-ibs-period.c | 1001 ++++++++++++++++++++
tools/perf/arch/x86/tests/arch-tests.c | 2 +
tools/perf/arch/x86/util/mem-events.c | 6 +
tools/perf/arch/x86/util/mem-events.h | 1 +
tools/perf/arch/x86/util/pmu.c | 20 +-
tools/perf/tests/shell/test_data_symbol.sh | 29 +-
tools/perf/util/amd-sample-raw.c | 77 +-
tools/perf/util/pmu.c | 11 +
tools/perf/util/pmu.h | 2 +
14 files changed, 1160 insertions(+), 24 deletions(-)
create mode 100644 tools/perf/arch/x86/tests/amd-ibs-period.c
IBS on Zen5:
- Introduced Load Latency filtering capability.
- Shows DTLB and page size information differently from prior generations.

Kernel changes for these enhancements are already upstream. So, resending
tools changes separately.

Patches are prepared on perf-tools-next/perf-tools-next (85447f68a1e3).

v3: https://lore.kernel.org/r/20250205060547.1337-1-ravi.bangoria@amd.com
v3->v4:
- Remove kernel changes.
- Improve IBS sample period unit test

Ravi Bangoria (4):
  perf amd ibs: Add Load Latency bits in raw dump
  perf amd ibs: Incorporate Zen5 DTLB and PageSize information
  perf mem/c2c amd: Add ldlat support
  perf test amd ibs: Add sample period unit test

tools/perf/Documentation/perf-amd-ibs.txt | 9 +
tools/perf/Documentation/perf-c2c.txt | 11 +-
tools/perf/Documentation/perf-mem.txt | 13 +-
tools/perf/arch/x86/include/arch-tests.h | 1 +
tools/perf/arch/x86/tests/Build | 1 +
tools/perf/arch/x86/tests/amd-ibs-period.c | 1001 ++++++++++++++++++++
tools/perf/arch/x86/tests/arch-tests.c | 2 +
tools/perf/arch/x86/util/mem-events.c | 6 +
tools/perf/arch/x86/util/mem-events.h | 1 +
tools/perf/arch/x86/util/pmu.c | 20 +-
tools/perf/tests/shell/test_data_symbol.sh | 29 +-
tools/perf/util/amd-sample-raw.c | 77 +-
tools/perf/util/pmu.c | 11 +
tools/perf/util/pmu.h | 2 +
14 files changed, 1160 insertions(+), 24 deletions(-)
create mode 100644 tools/perf/arch/x86/tests/amd-ibs-period.c
--
2.43.0
On Tue, Apr 29, 2025 at 03:59:34AM +0000, Ravi Bangoria wrote:
> IBS on Zen5:
> - Introduced Load Latency filtering capability.
> - Shows DTLB and page size information differently from prior generations.
>
> Kernel changes for these enhancements are already upstream. So, resending
> tools changes separately.
>
> Patches are prepared on perf-tools-next/perf-tools-next (85447f68a1e3).
>
> v3: https://lore.kernel.org/r/20250205060547.1337-1-ravi.bangoria@amd.com
> v3->v4:
> - Remove kernel changes.
> - Improve IBS sample period unit test
Preliminary tests with what is in tmp.perf-tools-next:
root@number:~# perf mem record find / > /dev/null
[ perf record: Woken up 5 times to write data ]
[ perf record: Captured and wrote 1.992 MB perf.data (31824 samples) ]
root@number:~# perf mem report -s mem --stdio
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 31K of event 'ibs_op//'
# Total weight : 66561
# Sort order : mem
#
# Overhead Samples Memory access
# ........ ............ .......................................
#
36.51% 456 L2 hit
30.26% 20141 N/A
16.75% 11149 L1 hit
10.08% 18 RAM hit
6.39% 52 L3 hit
0.01% 8 LFB/MAB hit
#
# (Tip: To collect Processor Trace with samples use perf record -e '{intel_pt//,cycles}' ; perf script --call-trace or --insn-trace --xed -F +ipc (remove --xed if no xed))
#
root@number:~#
root@number:~# perf evlist -v
ibs_op//: type: 11 (ibs_op), size: 136, config: 0, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|PERIOD|DATA_SRC|WEIGHT_STRUCT, read_format: ID|LOST, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, mmap_data: 1, sample_id_all: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1
root@number:~#
root@number:~# perf report --header-only | head -25
# ========
# captured on : Tue Apr 29 22:54:04 2025
# header version : 1
# data offset : 512
# data size : 668520
# feat offset : 669032
# hostname : number
# os release : 6.15.0-rc4+
# perf version : 6.15.rc2.g3e8278077117
# arch : x86_64
# nrcpus online : 32
# nrcpus avail : 32
# cpudesc : AMD Ryzen 9 9950X3D 16-Core Processor
# cpuid : AuthenticAMD,26,68,0
# total memory : 31928240 kB
# cmdline : /home/acme/bin/perf mem record find /
# event : name = ibs_op//, , id = { 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335 }, type = 11 (ibs_op), size = 136, config = 0, { sample_period, sample_freq } = 4000, sample_type = IP|TID|TIME|ADDR|PERIOD|DATA_SRC|WEIGHT_STRUCT, read_format = ID|LOST, disabled = 1, inherit = 1, mmap = 1, comm = 1, freq = 1, enable_on_exec = 1, task = 1, mmap_data = 1, sample_id_all = 1, mmap2 = 1, comm_exec = 1, ksymbol = 1, bpf_event = 1
# CPU_TOPOLOGY info available, use -I to display
# NUMA_TOPOLOGY info available, use -I to display
# pmu mappings: cpu = 4, amd_df = 12, amd_iommu_0 = 15, amd_l3 = 13, amd_umc_0 = 14, breakpoint = 5, hwmon_amdgpu = 4294901761, hwmon_k10temp = 4294901762, hwmon_nvme = 4294901760, hwmon_r8169_0_e00_00 = 4294901763, ibs_fetch = 10, ibs_op = 11, kprobe = 8, msr = 16, power = 17, power_core = 18, software = 1, tool = 4294967294, tracepoint = 2, uprobe = 9
# CACHE info available, use -I to display
# time of first sample : 244.312475
# time of last sample : 246.801803
# sample duration : 2489.328 ms
# MEM_TOPOLOGY info available, use -I to display
root@number:~#
root@number:~# perf report | head
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 9K of event 'ibs_op//'
# Event count (approx.): 12948758501
#
# Overhead Command Shared Object Symbol
# ........ ....... ......................... ........................................
root@number:~# perf report | head -20
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 9K of event 'ibs_op//'
# Event count (approx.): 12948758501
#
# Overhead Command Shared Object Symbol
# ........ ....... ......................... ........................................
#
6.11% find [kernel.kallsyms] [k] btrfs_bin_search
4.91% find [kernel.kallsyms] [k] filldir64
4.77% find find [.] consider_visiting
3.95% find [kernel.kallsyms] [k] memcpy
2.76% find [kernel.kallsyms] [k] entry_SYSCALL_64
2.59% find libc.so.6 [.] __printf_buffer
2.52% find [kernel.kallsyms] [k] btrfs_getattr
2.09% find [kernel.kallsyms] [k] pid_delete_dentry
1.88% find libc.so.6 [.] msort_with_tmp.part.0
root@number:~#
root@number:~# perf annotate -v --stdio2 btrfs_bin_search
build id event received for [vdso]: 6dc5707510cc7434be3d6cb4dc6bae12881efda3 [20]
build id event received for /usr/bin/find: 3804e1e1214a39a975e093a79ec04961743ef5c5 [20]
build id event received for /usr/lib64/libc.so.6: 2b3c02fe7e4d3811767175b6f323692a10a4e116 [20]
build id event received for [kernel.kallsyms]: d391f0e79126801bc8a8f907e763de7979941712 [20]
Looking at the vmlinux_path (8 entries long)
Using /lib/modules/6.15.0-rc4+/build/vmlinux for symbols
read_gnu_debugdata: using .gnu_debugdata of /usr/bin/find
symbol__disassemble: filename=/lib/modules/6.15.0-rc4+/build/vmlinux, sym=btrfs_bin_search, start=0xffffffffac97e890, end=0xffffffffac97ead9
annotating [0x2e87fbf0] /lib/modules/6.15.0-rc4+/build/vmlinux : [0x2fa7f070] btrfs_bin_search
Disassembled with llvm
Samples: 585 of event 'ibs_op//', 4000 Hz, Event count (approx.): 790819874, [percent: local period]
btrfs_bin_search() /lib/modules/6.15.0-rc4+/build/vmlinux
Percent 0xffffffff8197e890 <btrfs_bin_search>:
0.17 endbr64
→ callq __fentry__
0.16 pushq %r15
0.18 movq %rdx,%r15
pushq %r14
pushq %r13
0.18 pushq %r12
0.34 pushq %rbp
movl %esi,%ebp
pushq %rbx
0.35 movq %rdi,%rbx
subq $0x48,%rsp
movq (%rdi),%r9
movq %rcx,(%rsp)
0.34 movq %r9,%rdx
andl $0xfff,%edx
movq __stack_chk_guard,%r14
0.18 movq %r14,0x40(%rsp)
0.33 movl %esi,%r14d
0.17 movq 0x70(%rdi),%rsi
movq %rsi,%rax
subq vmemmap_base,%rax
sarq $0x6, %rax
0.17 shlq $0xc, %rax
0.15 addq page_offset_base,%rax
0.17 addq %rdx,%rax
movl 0x60(%rax),%r13d
0.17 cmpl %ebp,%r13d
→ jb btrfs_bin_search.cold
cmpb $0x1,0x64(%rax)
sbbl %r12d,%r12d
andl $-0x8,%r12d
0.18 addl $0x21,%r12d
cmpl %r13d,%r14d
↓ jae 20f
1.04 84: leal (%r14,%r13),%ebp
0.81 movb $0x0,0x3f(%rsp)
1.20 movslq 0xc(%rbx),%r10
0.66 movl $0xfff,%r11d
1.06 movq $0x0,0x2f(%rsp)
0.85 shrl %ebp
1.35 movq $0x0,0x37(%rsp)
1.93 movl %ebp,%eax
1.04 movq (%rsi),%rdx
2.20 imull %r12d,%eax
10.77 cltq
10.26 addq $0x65,%rax
3.11 addq %rax,%r9
0.68 andl $0x40,%edx
↓ je e3
movq 0x40(%rsi),%rsi
movl $0x1000,%r11d
movzbl %sil,%ecx
shlq %cl, %r11
subq $0x1,%r11
root@number:~#
I'll do more tests tomorrow and try some of the workloads that Joe uses.
Thanks a lot!
- Arnaldo
On 30-Apr-25 7:30 AM, Arnaldo Carvalho de Melo wrote:
> On Tue, Apr 29, 2025 at 03:59:34AM +0000, Ravi Bangoria wrote:
>> IBS on Zen5:
>> - Introduced Load Latency filtering capability.
>> - Shows DTLB and page size information differently from prior generations.
>>
>> Kernel changes for these enhancements are already upstream. So, resending
>> tools changes separately.
>>
>> Patches are prepared on perf-tools-next/perf-tools-next (85447f68a1e3).
>>
>> v3: https://lore.kernel.org/r/20250205060547.1337-1-ravi.bangoria@amd.com
>> v3->v4:
>> - Remove kernel changes.
>> - Improve IBS sample period unit test
>
> Preliminary tests with what is in tmp.perf-tools-next:

[...]

> I'll do more tests tomorrow and try some of the workloads that Joe uses.

Gentle ping, Arnaldo!

Thanks,
Ravi
© 2016 - 2026 Red Hat, Inc.