[PATCH v1 22/23] perf unwind-libdw: Don't discard loaded ELF/Dwarf after every unwind

Ian Rogers posted 23 patches 3 weeks, 1 day ago
[PATCH v1 22/23] perf unwind-libdw: Don't discard loaded ELF/Dwarf after every unwind
Posted by Ian Rogers 3 weeks, 1 day ago
The unwind-libdw dwfl has ELF binaries associated with mmap
addresses. Experimenting with using the per dso dwfl it is required to
alter the address to be 0 based variant. Unfortunately libdwfl doesn't
allow a single unwind and then an update to the return address to be 0
based as there are assertions that registers aren't updated once an
unwind has started, etc.

As removing the dwfl didn't prove possible, an alternative is to just
not discard the dwfl when the unwind ends. The dwfl is valid for a
process unless a dso is loaded at the same address as a previous
one. So keep the dwfl with the maps, invalidate it if a map is removed
(in case a new map replaces it) and recycle the dwfl in the unwinding
code. A wrinkly in the implementation of this is that the attached
thread argument is remembered by the dwfl and so it needs to be a
pointer to memory that also persists with the dwfl (struct
dwfl_ui_thread_info in the code).

Recording 10 seconds of system wide data with --call-graph=dwarf and
then processing with perf report shows a total runtime improvement
from 41.583s to 2.279s (an 18x speedup).

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/util/maps.c         | 36 +++++++++++++-
 tools/perf/util/maps.h         |  4 ++
 tools/perf/util/unwind-libdw.c | 90 +++++++++++++++++++++++++---------
 tools/perf/util/unwind-libdw.h |  9 +++-
 4 files changed, 112 insertions(+), 27 deletions(-)

diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c
index c321d4f4d846..8ccc46d515b6 100644
--- a/tools/perf/util/maps.c
+++ b/tools/perf/util/maps.c
@@ -10,6 +10,7 @@
 #include "thread.h"
 #include "ui/ui.h"
 #include "unwind.h"
+#include "unwind-libdw.h"
 #include <internal/rc_check.h>
 
 /*
@@ -39,6 +40,9 @@ DECLARE_RC_STRUCT(maps) {
 #ifdef HAVE_LIBUNWIND_SUPPORT
 	void		*addr_space;
 	const struct unwind_libunwind_ops *unwind_libunwind_ops;
+#endif
+#ifdef HAVE_LIBDW_SUPPORT
+	void		*libdw_addr_space_dwfl;
 #endif
 	refcount_t	 refcnt;
 	/**
@@ -203,6 +207,17 @@ void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libun
 	RC_CHK_ACCESS(maps)->unwind_libunwind_ops = ops;
 }
 #endif
+#ifdef HAVE_LIBDW_SUPPORT
+void *maps__libdw_addr_space_dwfl(const struct maps *maps)
+{
+	return RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl;
+}
+
+void maps__set_libdw_addr_space_dwfl(struct maps *maps, void *dwfl)
+{
+	RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl = dwfl;
+}
+#endif
 
 static struct rw_semaphore *maps__lock(struct maps *maps)
 {
@@ -218,6 +233,9 @@ static void maps__init(struct maps *maps, struct machine *machine)
 #ifdef HAVE_LIBUNWIND_SUPPORT
 	RC_CHK_ACCESS(maps)->addr_space = NULL;
 	RC_CHK_ACCESS(maps)->unwind_libunwind_ops = NULL;
+#endif
+#ifdef HAVE_LIBDW_SUPPORT
+	RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl = NULL;
 #endif
 	refcount_set(maps__refcnt(maps), 1);
 	RC_CHK_ACCESS(maps)->nr_maps = 0;
@@ -240,6 +258,9 @@ static void maps__exit(struct maps *maps)
 	zfree(&maps_by_address);
 	zfree(&maps_by_name);
 	unwind__finish_access(maps);
+#ifdef HAVE_LIBDW_SUPPORT
+	libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps));
+#endif
 }
 
 struct maps *maps__new(struct machine *machine)
@@ -549,6 +570,9 @@ void maps__remove(struct maps *maps, struct map *map)
 	__maps__remove(maps, map);
 	check_invariants(maps);
 	up_write(maps__lock(maps));
+#ifdef HAVE_LIBDW_SUPPORT
+	libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps));
+#endif
 }
 
 bool maps__empty(struct maps *maps)
@@ -604,18 +628,26 @@ int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data)
 void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data), void *data)
 {
 	struct map **maps_by_address;
+	bool removed = false;
 
 	down_write(maps__lock(maps));
 
 	maps_by_address = maps__maps_by_address(maps);
 	for (unsigned int i = 0; i < maps__nr_maps(maps);) {
-		if (cb(maps_by_address[i], data))
+		if (cb(maps_by_address[i], data)) {
 			__maps__remove(maps, maps_by_address[i]);
-		else
+			removed = true;
+		} else {
 			i++;
+		}
 	}
 	check_invariants(maps);
 	up_write(maps__lock(maps));
+	if (removed) {
+#ifdef HAVE_LIBDW_SUPPORT
+		libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps));
+#endif
+	}
 }
 
 struct symbol *maps__find_symbol(struct maps *maps, u64 addr, struct map **mapp)
diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h
index d9aa62ed968a..20c52084ba9e 100644
--- a/tools/perf/util/maps.h
+++ b/tools/perf/util/maps.h
@@ -52,6 +52,10 @@ void maps__set_addr_space(struct maps *maps, void *addr_space);
 const struct unwind_libunwind_ops *maps__unwind_libunwind_ops(const struct maps *maps);
 void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libunwind_ops *ops);
 #endif
+#ifdef HAVE_LIBDW_SUPPORT
+void *maps__libdw_addr_space_dwfl(const struct maps *maps);
+void maps__set_libdw_addr_space_dwfl(struct maps *maps, void *dwfl);
+#endif
 
 size_t maps__fprintf(struct maps *maps, FILE *fp);
 
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index e0321043af88..c1646ef5f971 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -20,6 +20,17 @@
 #include "callchain.h"
 #include "util/env.h"
 
+/*
+ * The dwfl thread argument passed to functions like memory_read. Memory has to
+ * be allocated to persist of multiple uses of the dwfl.
+ */
+struct dwfl_ui_thread_info {
+	/* Back link to the dwfl. */
+	Dwfl *dwfl;
+	/* The current unwind info, only 1 is supported. */
+	struct unwind_info *ui;
+};
+
 static char *debuginfo_path;
 
 static int __find_debuginfo(Dwfl_Module *mod __maybe_unused, void **userdata,
@@ -35,6 +46,19 @@ static int __find_debuginfo(Dwfl_Module *mod __maybe_unused, void **userdata,
 	return -1;
 }
 
+void libdw__invalidate_dwfl(struct maps *maps, void *arg)
+{
+	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
+
+	if (!dwfl_ui_ti)
+		return;
+
+	assert(dwfl_ui_ti->ui == NULL);
+	maps__set_libdw_addr_space_dwfl(maps, NULL);
+	dwfl_end(dwfl_ui_ti->dwfl);
+	free(dwfl_ui_ti);
+}
+
 static const Dwfl_Callbacks offline_callbacks = {
 	.find_debuginfo		= __find_debuginfo,
 	.debuginfo_path		= &debuginfo_path,
@@ -187,7 +211,8 @@ static int access_dso_mem(struct unwind_info *ui, Dwarf_Addr addr,
 static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *result,
 			void *arg)
 {
-	struct unwind_info *ui = arg;
+	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
+	struct unwind_info *ui = dwfl_ui_ti->ui;
 	uint16_t e_machine = thread__e_machine(ui->thread, ui->machine);
 	struct stack_dump *stack = &ui->sample->user_stack;
 	u64 start, end;
@@ -228,7 +253,8 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 
 static bool libdw_set_initial_registers(Dwfl_Thread *thread, void *arg)
 {
-	struct unwind_info *ui = arg;
+	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
+	struct unwind_info *ui = dwfl_ui_ti->ui;
 	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
 	Dwarf_Word *dwarf_regs;
 	int max_dwarf_reg = 0;
@@ -320,33 +346,50 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 			int max_stack,
 			bool best_effort)
 {
-	struct machine *machine = maps__machine(thread__maps(thread));
+	struct maps *maps = thread__maps(thread);
+	struct machine *machine = maps__machine(maps);
 	uint16_t e_machine = thread__e_machine(thread, machine);
-	struct unwind_info *ui, ui_buf = {
-		.sample		= data,
-		.thread		= thread,
-		.machine	= machine,
-		.cb		= cb,
-		.arg		= arg,
-		.max_stack	= max_stack,
-		.e_machine	= e_machine,
-		.best_effort    = best_effort
-	};
+	struct dwfl_ui_thread_info *dwfl_ui_ti;
+	static struct unwind_info *ui;
+	Dwfl *dwfl;
 	Dwarf_Word ip;
 	int err = -EINVAL, i;
 
 	if (!data->user_regs || !data->user_regs->regs)
 		return -EINVAL;
 
-	ui = zalloc(sizeof(ui_buf) + sizeof(ui_buf.entries[0]) * max_stack);
+	ui = zalloc(sizeof(*ui) + sizeof(ui->entries[0]) * max_stack);
 	if (!ui)
 		return -ENOMEM;
 
-	*ui = ui_buf;
+	*ui = (struct unwind_info){
+		.sample		= data,
+		.thread		= thread,
+		.machine	= machine,
+		.cb		= cb,
+		.arg		= arg,
+		.max_stack	= max_stack,
+		.e_machine	= e_machine,
+		.best_effort    = best_effort
+	};
 
-	ui->dwfl = dwfl_begin(&offline_callbacks);
-	if (!ui->dwfl)
-		goto out;
+	dwfl_ui_ti = maps__libdw_addr_space_dwfl(maps);
+	if (dwfl_ui_ti) {
+		dwfl = dwfl_ui_ti->dwfl;
+	} else {
+		dwfl_ui_ti = zalloc(sizeof(*dwfl_ui_ti));
+		dwfl = dwfl_begin(&offline_callbacks);
+		if (!dwfl)
+			goto out;
+
+		dwfl_ui_ti->dwfl = dwfl;
+		maps__set_libdw_addr_space_dwfl(maps, dwfl_ui_ti);
+	}
+	assert(dwfl_ui_ti->ui == NULL);
+	assert(dwfl_ui_ti->dwfl == dwfl);
+	assert(dwfl_ui_ti == maps__libdw_addr_space_dwfl(maps));
+	dwfl_ui_ti->ui = ui;
+	ui->dwfl = dwfl;
 
 	err = perf_reg_value(&ip, data->user_regs, perf_arch_reg_ip(e_machine));
 	if (err)
@@ -356,11 +399,12 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	if (err)
 		goto out;
 
-	err = !dwfl_attach_state(ui->dwfl, /*elf=*/NULL, thread__tid(thread), &callbacks, ui);
-	if (err)
-		goto out;
+	dwfl_attach_state(dwfl, /*elf=*/NULL, thread__tid(thread), &callbacks,
+			  /* Dwfl thread function argument*/dwfl_ui_ti);
+	// Ignore thread already attached error.
 
-	err = dwfl_getthread_frames(ui->dwfl, thread__tid(thread), frame_callback, ui);
+	err = dwfl_getthread_frames(dwfl, thread__tid(thread), frame_callback,
+				    /* Dwfl frame function argument*/ui);
 
 	if (err && ui->max_stack != max_stack)
 		err = 0;
@@ -384,7 +428,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	for (i = 0; i < ui->idx; i++)
 		map_symbol__exit(&ui->entries[i].ms);
 
-	dwfl_end(ui->dwfl);
+	dwfl_ui_ti->ui = NULL;
 	free(ui);
 	return 0;
 }
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 9c5b5fcaaae8..3dec0ab8bd50 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -2,15 +2,17 @@
 #ifndef __PERF_UNWIND_LIBDW_H
 #define __PERF_UNWIND_LIBDW_H
 
-#include <elfutils/libdwfl.h>
+#include <stdint.h>
 #include "unwind.h"
 
 struct machine;
 struct perf_sample;
 struct thread;
 
+#ifdef HAVE_LIBDW_SUPPORT
+
 struct unwind_info {
-	Dwfl			*dwfl;
+	void			*dwfl;
 	struct perf_sample      *sample;
 	struct machine          *machine;
 	struct thread           *thread;
@@ -23,4 +25,7 @@ struct unwind_info {
 	struct unwind_entry	entries[];
 };
 
+void libdw__invalidate_dwfl(struct maps *maps, void *dwfl);
+#endif
+
 #endif /* __PERF_UNWIND_LIBDW_H */
-- 
2.52.0.457.g6b5491de43-goog
Re: [PATCH v1 22/23] perf unwind-libdw: Don't discard loaded ELF/Dwarf after every unwind
Posted by Serhei Makarov 1 week, 5 days ago
> As removing the dwfl didn't prove possible, an alternative is to just not discard the dwfl when the unwind ends. The dwfl is valid for a process unless a dso is loaded at the same address as a previous one. So keep the dwfl with the maps, invalidate it if a map is removed (in case a new map replaces it) and recycle the dwfl in the unwinding code.

Relevant note:

I'm looking at whether elfutils libdwfl_stacktrace might further help with these issues. The libdwfl_stacktrace library is currently shipped as an experimental addon to libdwfl in elfutils 0.194, but I'm intending to stabilize the API this year. There's code for some analogous functions: translating perf->dwarf register files, and caching Dwfl and Elf data to speed up unwinding across multiple processes. (Thus, if unwinding a collection of perf_events from 16 processes that use libc, we don't need to load the CFI for libc.so 16 times.)

I think once I've stabilized the libdwfl_stacktrace API and expanded the register support to a larger set of perf-supported architectures, it makes sense for me to author a corresponding set of patches for perf. I'll see if it's worth adding more libdwflst functionality to elfutils 0.195 in anticipation of this.

-- Serhei Makarov
Re: [PATCH v1 22/23] perf unwind-libdw: Don't discard loaded ELF/Dwarf after every unwind
Posted by Ian Rogers 1 week, 5 days ago
On Tue, Jan 27, 2026 at 9:43 AM Serhei Makarov <serhei@serhei.io> wrote:
>
> > As removing the dwfl didn't prove possible, an alternative is to just not discard the dwfl when the unwind ends. The dwfl is valid for a process unless a dso is loaded at the same address as a previous one. So keep the dwfl with the maps, invalidate it if a map is removed (in case a new map replaces it) and recycle the dwfl in the unwinding code.
>
> Relevant note:
>
> I'm looking at whether elfutils libdwfl_stacktrace might further help with these issues. The libdwfl_stacktrace library is currently shipped as an experimental addon to libdwfl in elfutils 0.194, but I'm intending to stabilize the API this year. There's code for some analogous functions: translating perf->dwarf register files, and caching Dwfl and Elf data to speed up unwinding across multiple processes. (Thus, if unwinding a collection of perf_events from 16 processes that use libc, we don't need to load the CFI for libc.so 16 times.)
>
> I think once I've stabilized the libdwfl_stacktrace API and expanded the register support to a larger set of perf-supported architectures, it makes sense for me to author a corresponding set of patches for perf. I'll see if it's worth adding more libdwflst functionality to elfutils 0.195 in anticipation of this.

Thanks Serhei, this would be really awesome! I've done my best wrt
trying to optimize around libdwfl, but I wasn't able to have a single
dwfl for the addr2line case with a dso and the stack unwinding code.
Just to give too many details :-)

With addr2line we have the dso and an offset into the dso of some
point in code. We want to get the symbol for that address and inline
information. The "bias" notion in dwfl was something of a speed bump
in writing that code and was broken in an early version:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/libdw.c?h=perf-tools-next#n133
Just documenting libdw/libdwfl addresses better to say whether a bias
is or isn't necessary would be useful. Walking the inline information
is also (strangely to me) done outside of libdw, which makes me wonder
about support for things like partial compilation units:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/dwarf-aux.c?h=perf-tools-next#n147

For the unwinding:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/unwind-libdw.c?h=perf-tools-next#n407
I experimented with failing each unwind step so that then I could take
the return PC, make it an offset into a dso and reuse the dso dwfl.
This breaks many asserts in libdwfl and so I gave it up.

There's also the use of a dwfl for type profiling and probes in debuginfo:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/debuginfo.c?h=perf-tools-next#n55

For making the code generic we needed mapping functions between perf
and dwarf numbering, as well as number and string mapping. A problem
with the generic code was that if a register was encoded that libdwfl
didn't support it broke libdwfl. This led to clipping the registers:
https://web.git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/dwarf-regs.c?h=perf-tools-next#n157
but this feels like code that's going to bitrot, cause elfutil version
problems, etc.

Something I noticed is that there was a reduction in crashes from
0.192 to 0.194. I'm a little concerned that people may package perf
without updating elfutils and witness crashes because of these recent
perf changes. Hopefully distributions will move forward to 0.194 asap.

Thanks,
Ian

> -- Serhei Makarov