From: Josh Poimboeuf <jpoimboe@kernel.org>
Add an interface for scheduling task work to unwind the user space stack
before returning to user space. This solves several problems for its
callers:
- Ensure the unwind happens in task context even if the caller may be
running in interrupt context.
- Avoid duplicate unwinds, whether called multiple times by the same
caller or by different callers.
- Take a timestamp when the first request comes in after the task
entered the kernel. This will be returned to the calling function
along with the stack trace when the task leaves the kernel, and it
can be used to correlate kernel unwinds/traces with the user unwind.
For this to work properly, the architecture must have a local_clock()
resolution that guarantees a different timestamp for each task
system call.
The timestamp is used to detect when multiple requests refer to the same
stacktrace. It is generated the first time a user space stacktrace is
requested after the task enters the kernel.

The timestamp is passed back to the caller at request time, and when the
stacktrace is generated upon returning to user space, the requester's
callback is invoked with the timestamp as well as the stacktrace. The
timestamp is cleared when the task goes back to user space. Note, this
currently adds another conditional to the unwind_reset_info() path that
is always called when returning to user space, but future changes will
collapse this back into a single conditional.
A global list, protected by a global mutex, holds the tracers that
register with the unwind infrastructure. The number of registered
tracers will be limited in future changes. Each perf program or ftrace
instance will register its own descriptor to use for deferred unwind
stack traces.
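For illustration only, a rough sketch of how a tracer might use this
interface (the my_tracer_* names are made up for this example and are
not part of the patch):

	#include <linux/unwind_deferred.h>

	/* Hypothetical tracer descriptor registered with the unwind infrastructure */
	static struct unwind_work my_tracer_unwind;

	/* Invoked in task context just before the task returns to user space */
	static void my_tracer_unwind_callback(struct unwind_work *work,
					      struct unwind_stacktrace *trace,
					      u64 timestamp)
	{
		/* Record trace->entries[0..trace->nr) tagged with @timestamp */
	}

	static int my_tracer_setup(void)
	{
		/* Register the descriptor and its callback */
		return unwind_deferred_init(&my_tracer_unwind,
					    my_tracer_unwind_callback);
	}

	/* Called from (non-NMI) interrupt context when an event is recorded */
	static void my_tracer_handle_event(void)
	{
		u64 timestamp;

		/*
		 * Request a deferred user stack trace. Tag the kernel-side
		 * event with @timestamp so it can later be matched with the
		 * user stack trace handed to the callback.
		 */
		if (unwind_deferred_request(&my_tracer_unwind, &timestamp) < 0)
			return;
	}

unwind_deferred_request() returns 1 if the callback was already queued
for this kernel entry, so the same timestamp can be attached to multiple
events from the same entry context.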
Note, the function unwind_deferred_task_work() that gets called when
returning to user space uses a global mutex for synchronization, which
will be a big bottleneck. It will be replaced by SRCU, but that change
adds some complex synchronization that deserves its own commit.
Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v11: https://lore.kernel.org/20250625225715.825831885@goodmis.org
- Still need to clear cache->nr_entries in unwind_reset_info even if
timestamp is zero. This is because unwind_user_faultable() can be called
directly and it requires nr_entries to be zeroed but it does not touch
the timestamp.
include/linux/unwind_deferred.h | 24 +++++
include/linux/unwind_deferred_types.h | 3 +
kernel/unwind/deferred.c | 139 +++++++++++++++++++++++++-
3 files changed, 165 insertions(+), 1 deletion(-)
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index baacf4a1eb4c..c6548e8d64d1 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -2,9 +2,19 @@
#ifndef _LINUX_UNWIND_USER_DEFERRED_H
#define _LINUX_UNWIND_USER_DEFERRED_H
+#include <linux/task_work.h>
#include <linux/unwind_user.h>
#include <linux/unwind_deferred_types.h>
+struct unwind_work;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 timestamp);
+
+struct unwind_work {
+ struct list_head list;
+ unwind_callback_t func;
+};
+
#ifdef CONFIG_UNWIND_USER
void unwind_task_init(struct task_struct *task);
@@ -12,8 +22,19 @@ void unwind_task_free(struct task_struct *task);
int unwind_user_faultable(struct unwind_stacktrace *trace);
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
+int unwind_deferred_request(struct unwind_work *work, u64 *timestamp);
+void unwind_deferred_cancel(struct unwind_work *work);
+
static __always_inline void unwind_reset_info(void)
{
+ if (unlikely(current->unwind_info.timestamp))
+ current->unwind_info.timestamp = 0;
+ /*
+ * As unwind_user_faultable() can be called directly and
+ * depends on nr_entries being cleared on exit to user,
+ * this needs to be a separate conditional.
+ */
if (unlikely(current->unwind_info.cache))
current->unwind_info.cache->nr_entries = 0;
}
@@ -24,6 +45,9 @@ static inline void unwind_task_init(struct task_struct *task) {}
static inline void unwind_task_free(struct task_struct *task) {}
static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; }
+static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
+static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
+static inline void unwind_deferred_cancel(struct unwind_work *work) {}
static inline void unwind_reset_info(void) {}
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index db5b54b18828..5df264cf81ad 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -9,6 +9,9 @@ struct unwind_cache {
struct unwind_task_info {
struct unwind_cache *cache;
+ struct callback_head work;
+ u64 timestamp;
+ int pending;
};
#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index 96368a5aa522..d5f2c004a5b0 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -2,16 +2,43 @@
/*
* Deferred user space unwinding
*/
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_deferred.h>
+#include <linux/sched/clock.h>
+#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sizes.h>
#include <linux/slab.h>
-#include <linux/unwind_deferred.h>
+#include <linux/mm.h>
/* Make the cache fit in a 4K page */
#define UNWIND_MAX_ENTRIES \
((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
+/* Guards adding to and reading the list of callbacks */
+static DEFINE_MUTEX(callback_mutex);
+static LIST_HEAD(callbacks);
+
+/*
+ * Read the task context timestamp, if this is the first caller then
+ * it will set the timestamp.
+ *
+ * For this to work properly, the timestamp (local_clock()) must
+ * have a resolution that will guarantee a different timestamp
+ * every time a task makes a system call. That is, two short
+ * system calls back to back must have a different timestamp.
+ */
+static u64 get_timestamp(struct unwind_task_info *info)
+{
+ lockdep_assert_irqs_disabled();
+
+ if (!info->timestamp)
+ info->timestamp = local_clock();
+
+ return info->timestamp;
+}
+
/**
* unwind_user_faultable - Produce a user stacktrace in faultable context
* @trace: The descriptor that will store the user stacktrace
@@ -62,11 +89,120 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
return 0;
}
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+ struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
+ struct unwind_stacktrace trace;
+ struct unwind_work *work;
+ u64 timestamp;
+
+ if (WARN_ON_ONCE(!info->pending))
+ return;
+
+ /* Allow work to come in again */
+ WRITE_ONCE(info->pending, 0);
+
+ /*
+ * From here on out, the callback must always be called, even if it's
+ * just an empty trace.
+ */
+ trace.nr = 0;
+ trace.entries = NULL;
+
+ unwind_user_faultable(&trace);
+
+ timestamp = info->timestamp;
+
+ guard(mutex)(&callback_mutex);
+ list_for_each_entry(work, &callbacks, list) {
+ work->func(work, &trace, timestamp);
+ }
+}
+
+/**
+ * unwind_deferred_request - Request a user stacktrace on task exit
+ * @work: Unwind descriptor requesting the trace
+ * @timestamp: The time stamp of the first request made for this task
+ *
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The returned @timestamp output is the timestamp of the very first request
+ * for a user space stacktrace for this task since it entered the kernel.
+ * It can be from a request by any caller of this infrastructure.
+ * Its value will also be passed to the callback function. It can be
+ * used to stitch kernel and user stack traces together in post-processing.
+ *
+ * Note, the architecture must have a local_clock() implementation that
+ * guarantees a different timestamp per task system call.
+ *
+ * It's valid to call this function multiple times for the same @work within
+ * the same task entry context. Each call will return the same timestamp
+ * while the task hasn't left the kernel. If the callback is not pending because
+ * it has already been previously called for the same entry context, it will be
+ * called again with the same stack trace and timestamp.
+ *
+ * Return: 1 if the callback was already queued.
+ * 0 if the callback was successfully queued.
+ * Negative if there's an error.
+ * @timestamp holds the timestamp of the first request by any user
+ */
+int unwind_deferred_request(struct unwind_work *work, u64 *timestamp)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+ int ret;
+
+ *timestamp = 0;
+
+ if (WARN_ON_ONCE(in_nmi()))
+ return -EINVAL;
+
+ if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
+ !user_mode(task_pt_regs(current)))
+ return -EINVAL;
+
+ guard(irqsave)();
+
+ *timestamp = get_timestamp(info);
+
+ /* callback already pending? */
+ if (info->pending)
+ return 1;
+
+ /* The work has been claimed, now schedule it. */
+ ret = task_work_add(current, &info->work, TWA_RESUME);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
+ info->pending = 1;
+ return 0;
+}
+
+void unwind_deferred_cancel(struct unwind_work *work)
+{
+ if (!work)
+ return;
+
+ guard(mutex)(&callback_mutex);
+ list_del(&work->list);
+}
+
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{
+ memset(work, 0, sizeof(*work));
+
+ guard(mutex)(&callback_mutex);
+ list_add(&work->list, &callbacks);
+ work->func = func;
+ return 0;
+}
+
void unwind_task_init(struct task_struct *task)
{
struct unwind_task_info *info = &task->unwind_info;
memset(info, 0, sizeof(*info));
+ init_task_work(&info->work, unwind_deferred_task_work);
}
void unwind_task_free(struct task_struct *task)
@@ -74,4 +210,5 @@ void unwind_task_free(struct task_struct *task)
struct unwind_task_info *info = &task->unwind_info;
kfree(info->cache);
+ task_work_cancel(task, &info->work);
}
--
2.47.2
On Mon, Jun 30, 2025 at 08:53:27PM -0400, Steven Rostedt wrote:

> +/*
> + * Read the task context timestamp, if this is the first caller then
> + * it will set the timestamp.
> + *
> + * For this to work properly, the timestamp (local_clock()) must
> + * have a resolution that will guarantee a different timestamp
> + * every time a task makes a system call. That is, two short
> + * system calls back to back must have a different timestamp.
> + */
> +static u64 get_timestamp(struct unwind_task_info *info)
> +{
> +	lockdep_assert_irqs_disabled();
> +
> +	if (!info->timestamp)
> +		info->timestamp = local_clock();
> +
> +	return info->timestamp;
> +}

I'm very hesitant about this. Modern hardware can do this, but older
hardware (think Intel Core and AMD Bulldozer etc hardware) might
struggle with this. They don't have stable TSC and as such will use the
magic in kernel/sched/clock.c; which can get stuck on a window edge for
a little bit and re-use timestamps.
On Wed, 2 Jul 2025 18:36:09 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> > +static u64 get_timestamp(struct unwind_task_info *info)
> > +{
> > +	lockdep_assert_irqs_disabled();
> > +
> > +	if (!info->timestamp)
> > +		info->timestamp = local_clock();
> > +
> > +	return info->timestamp;
> > +}
>
> I'm very hesitant about this. Modern hardware can do this, but older
> hardware (think Intel Core and AMD Bulldozer etc hardware) might
> struggle with this. They don't have stable TSC and as such will use
> the magic in kernel/sched/clock.c; which can get stuck on a window
> edge for a little bit and re-use timestamps.

Well, the idea of using timestamps came from Microsoft as that's what
they do for a similar feature. It just seemed like an easier approach.
But I could definitely go back to the "cookie" idea that is just a per
cpu counter (with the CPU number as part of the cookie).

As the timestamp is likely not going to be as useful as it is with
Microsoft as there's no guarantee that the timestamp counter used is
the same as the timestamp used by the tracer asking for this, the
cookie approach may indeed be better.

-- Steve
On Wed, 2 Jul 2025 at 09:42, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> As the timestamp is likely not going to be as useful as it is with
> Microsoft as there's no guarantee that the timestamp counter used is
> the same as the timestamp used by the tracer asking for this, the cookie
> approach may indeed be better.

I think having just a percpu counter is probably the safest thing to do,
since the main reason just seems to be "correlate with the user event".

Using some kind of "real time" for correlation purposes seems like a bad
idea from any portability standpoint, considering just how many broken
timers we've seen across pretty much every architecture out there.

Also, does it actually have to be entirely unique? IOW, a 32-bit counter
(or even less) might be sufficient if there's some guarantee that
processing happens before the counter wraps around? Again - for
correlation purposes, just *how* many outstanding events can you have
that aren't ordered by other things too?

I'm sure people want to also get some kind of rough time idea, but don't
most perf events have them simply because people want time information
for _informational_ reasons, rather than to correlate two events?

              Linus
On Wed, 2 Jul 2025 09:56:39 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> Also, does it actually have to be entirely unique? IOW, a 32-bit
> counter (or even less) might be sufficient if there's some guarantee
> that processing happens before the counter wraps around? Again - for
> correlation purposes, just *how* many outstanding events can you have
> that aren't ordered by other things too?
>
> I'm sure people want to also get some kind of rough time idea, but
> don't most perf events have them simply because people want time
> information for _informational_ reasons, rather than to correlate two
> events?

And it only needs to be unique per thread per system call.

The real reason for this identifier is for lost events. As I explained
in the perf patchset, the issue is this:

In case of dropped events, we could have the case of:

 system_call() {
	<nmi> {
		take kernel stack trace
		ask for deferred trace.
	[EVENTS START DROPPING HERE]
	}

	Call deferred callback to record trace [ BUT IS DROPPED ]
 }

 system_call() {
	<nmi> {
		take kernel stack trace
		ask for deferred trace
	[ STILL DROPPING ]
	}

	[ READER CATCHES UP AND STARTS READING EVENTS AGAIN]

	Call deferred callback to record trace
 }

The user space tool will see the kernel stack traces of the first
system call, then it will see events dropped, and then it will see the
deferred user space stack trace of the second call.

The identifier is only there for uniqueness for that one thread to let
the tracer know if the deferred trace can be tied to events before it
lost them.

We figured a single 32 bit counter would be good enough when we first
discussed this idea, but we wanted per cpu counters to not have cache
contention every time a CPU wanted to increment the counter. But each
CPU would need an identifier so that a task migrating will not get the
same identifier for a different system call just because it migrated.
We used 16 bits for the CPU counter thinking that 32K of CPUs would
last some time in the future. We then chose to use a 64 bit number to
allow us to have 48 bits left for uniqueness which is plenty.

If we use 32 bits, that would give us 32K of unique system calls, and
it does seem possible that on a busy system, a tracer could lose 32K of
system calls before it gets going again. But we could still use it
anyway as the likelihood of losing exactly 32K of system calls and
starting tracing back up again will probably never happen. And if it
does, the worst thing that it will do is have the tracer mistake which
user space stack trace goes to which event. If you are tracing that
many events, this will likely be in the noise.

So I'm fine with making this a 32 bit counter using 16 bits for the CPU
and 16 bits for per thread uniqueness.

-- Steve
On Wed, 2 Jul 2025 13:26:05 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> So I'm fine with making this a 32 bit counter using 16 bits for the CPU
> and 16 bits for per thread uniqueness.

To still be able to use a 32 bit cmpxchg (for racing with an NMI), we
could break this number up into two 32 bit words. One with the CPU that
it was created on, and one with the per_cpu counter:

	union unwind_task_id {
		struct {
			u32	cpu;
			u32	cnt;
		};
		u64	id;
	};

	static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

	static u64 get_cookie(struct unwind_task_info *info)
	{
		u32 cpu_cnt;
		u32 cnt;
		u32 old = 0;

		if (info->id.cpu)
			return info->id.id;

		cpu_cnt = __this_cpu_read(unwind_ctx_ctr);
		cpu_cnt += 2;
		cnt = cpu_cnt | 1; /* Always make non zero */

		if (try_cmpxchg(&info->id.cnt, &old, cnt)) {
			/* Update the per cpu counter */
			__this_cpu_write(unwind_ctx_ctr, cpu_cnt);
		}
		/* Interrupts are disabled, the CPU will always be same */
		info->id.cpu = smp_processor_id() + 1; /* Must be non zero */

		return info->id.id;
	}

When leaving the kernel it does:

	info->id.id = 0;

-- Steve
On Wed, 2 Jul 2025 at 10:49, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> To still be able to use a 32 bit cmpxchg (for racing with an NMI), we
> could break this number up into two 32 bit words. One with the CPU that
> it was created on, and one with the per_cpu counter:

Do you actually even need a cpu number at all?

If this is per-thread, maybe just a per-thread counter would be good?
And you already *have* that per-thread thing, in
'current->unwind_info'.

And the work is using task_work, so the worker callback is also per-thread.

Also, is racing with NMI even a thing for the sequence number? I would
expect that the only thing that NMI would race with is the 'pending'
field, not the sequence number.

IOW, I think the logic could be

 - check 'pending' non-atomically, just because it's cheap

 - do a try_cmpxchg() on pending to actually deal with nmi races

   Actually, there are no SMP issues, just instruction atomicity - so a
   'local_try_cmpxchg() would be sufficient, but that's a 'long' not a
   'u32' ;^(

 - now you are exclusive for that thread, you're done, no more need
   for any atomic counter or percpu things

And then the next step is to just say "pending is the low bit of the
id word" and having a separate 31-bit counter that gets incremented by
"get_cookie()".

So then you end up with something like

	// New name for 'get_timestamp()'
	get_current_cookie() { return current->unwind_info.cookie; }

	// New name for 'get_cookie()':
	// 31-bit counter by just leaving bit #0 alone
	get_new_cookie() { current->unwind_info.cookie += 2; }

and then unwind_deferred_request() would do something like

	unwind_deferred_request()
	{
		int old, new;

		if (current->unwind_info.id)
			return 1;

		guard(irqsave)();

		// For NMI, if we race with 'get_new_cookie()'
		// we don't care if we get the old or the new one
		old = 0; new = get_current_cookie() | 1;
		if (!try_cmpxchg(&current->unwind_info.id, &old, new))
			return 1;

		.. now schedule the thing with that cookie set.

Hmm?

But I didn't actually look at the users, so maybe I'm missing some
reason why you want to have a separate per-cpu value.

Or maybe I missed something else entirely, and the above is complete
garbage and the ramblings of an insane mind.

It happens.

Off to get more coffee.

              Linus
On Wed, 2 Jul 2025 11:21:34 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, 2 Jul 2025 at 10:49, Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > To still be able to use a 32 bit cmpxchg (for racing with an NMI), we
> > could break this number up into two 32 bit words. One with the CPU that
> > it was created on, and one with the per_cpu counter:
>
> Do you actually even need a cpu number at all?
>
> If this is per-thread, maybe just a per-thread counter would be good?
> And you already *have* that per-thread thing, in
> 'current->unwind_info'.

I just hate to increase the task struct even more. I'm trying to keep
only the data I can't put elsewhere in the task struct.

>
> And the work is using task_work, so the worker callback is also per-thread.
>
> Also, is racing with NMI even a thing for the sequence number? I would
> expect that the only thing that NMI would race with is the 'pending'
> field, not the sequence number.

The sequence number is updated on the first time it's called after a
task enters the kernel. Then it should return the same number every
time after that. That's because this unique number is the identifier
for the user space stack trace which doesn't change while the task is
in the kernel, hence the id getting updated every time the task enters
the kernel and not after that.

>
> IOW, I think the logic could be
>
>  - check 'pending' non-atomically, just because it's cheap

Note, later patches move the "pending" bit into a mask that is used to
figure out what callbacks to call.

>
>  - do a try_cmpxchg() on pending to actually deal with nmi races
>
>    Actually, there are no SMP issues, just instruction atomicity - so a
>    'local_try_cmpxchg() would be sufficient, but that's a 'long' not a
>    'u32' ;^(

Yeah, later patches do try to use more local_try_cmpxchg() at different
parts. Even for the timestamp.

>
>  - now you are exclusive for that thread, you're done, no more need
>    for any atomic counter or percpu things

The trick and race with NMIs is, this needs to return that cookie for
both callers, where the cookie is the same number.

>
> And then the next step is to just say "pending is the low bit of the
> id word" and having a separate 31-bit counter that gets incremented by
> "get_cookie()".
>
> So then you end up with something like
>
>	// New name for 'get_timestamp()'
>	get_current_cookie() { return current->unwind_info.cookie; }
>
>	// New name for 'get_cookie()':
>	// 31-bit counter by just leaving bit #0 alone
>	get_new_cookie() { current->unwind_info.cookie += 2; }
>
> and then unwind_deferred_request() would do something like
>
>	unwind_deferred_request()
>	{
>		int old, new;
>
>		if (current->unwind_info.id)
>			return 1;
>
>		guard(irqsave)();
>
>		// For NMI, if we race with 'get_new_cookie()'
>		// we don't care if we get the old or the new one
>		old = 0; new = get_current_cookie() | 1;
>		if (!try_cmpxchg(&current->unwind_info.id, &old, new))
>			return 1;
>
>		.. now schedule the thing with that cookie set.
>
> Hmm?
>
> But I didn't actually look at the users, so maybe I'm missing some
> reason why you want to have a separate per-cpu value.

Now we could just have the counter be the 32 bit cookie (old timestamp)
in the task struct, and have bit 0 be if it is valued or not.

	static u32 get_cookie(struct unwind_task_info *info)
	{
		u32 cnt = READ_ONCE(info->id);
		u32 new_cnt;

		if (cnt & 1)
			return cnt;

		new_cnt = cnt + 3;
		if (try_cmpxchg(&info->id, &cnt, new_cnt))
			return new_cnt;

		return cnt; // return info->id; would work too
	}

Then when going back to user space:

	info->id &= ~1;

Now the counter is stored in the task struct and no other info is
needed.

>
> Or maybe I missed something else entirely, and the above is complete
> garbage and the ramblings of an insane mind.
>
> It happens.
>
> Off to get more coffee.

Enjoy, I just grabbed my last cup of the day.

-- Steve
On 2025-07-02 14:21, Linus Torvalds wrote:
> On Wed, 2 Jul 2025 at 10:49, Steven Rostedt <rostedt@goodmis.org> wrote:
>>
>> To still be able to use a 32 bit cmpxchg (for racing with an NMI), we
>> could break this number up into two 32 bit words. One with the CPU that
>> it was created on, and one with the per_cpu counter:
>
> Do you actually even need a cpu number at all?
>
> If this is per-thread, maybe just a per-thread counter would be good?
> And you already *have* that per-thread thing, in
> 'current->unwind_info'.

AFAIR, one of the goals here is to save the cookie into the trace to
allow trace post-processing to link the event triggering the unwinding
with the deferred unwinding data.

In order to make the trace analysis results reliable, we'd like to
avoid the following causes of uncertainty, which would mistakenly cause
the post-processing analysis to associate a stack trace with the wrong
event:

- Thread ID re-use (exit + clone/fork),
- Thread migration,
- Events discarded (e.g. buffer full) causing missing thread lifetime
  events or missing unwind-related events.

Unless I'm missing something, the per-thread counter would have issues
with thread ID re-use during the trace lifetime.

One possibility to solve this would be to introduce a thread identifier
(e.g. 64-bit thread ID value) which is unique across the entire kernel
lifetime. This approach would actually be useful for other use-cases as
well, but a 64-bit ID is not as compact as the CPU number, so it is
somewhat wasteful in terms of trace bandwidth.

Hence the alternative we came up with, which is to combine the CPU
number and a per-CPU counter to have a cheap way to keep track of a
globally unique counter using per-CPU partitioning.

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Wed, 2 Jul 2025 14:47:10 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> AFAIR, one of the goals here is to save the cookie into the trace
> to allow trace post-processing to link the event triggering the
> unwinding with the deferred unwinding data.
>
> In order to make the trace analysis results reliable, we'd like
> to avoid the following causes of uncertainty, which would
> mistakenly cause the post-processing analysis to associate
> a stack trace with the wrong event:
>
> - Thread ID re-use (exit + clone/fork),
> - Thread migration,
> - Events discarded (e.g. buffer full) causing missing
>   thread lifetime events or missing unwind-related events.
>
> Unless I'm missing something, the per-thread counter would have
> issues with thread ID re-use during the trace lifetime.

But you are missing one more thing that the trace can use, and that's
the time sequence. As soon as the same thread has a new id you can
assume all the older user space traces are not applicable for any new
events for that thread, or any other thread with the same thread ID.

Thus the only issue that can truly be a problem is if you have missed
events where thread id wraps around. I guess that could be possible if
a long running task finally exits and its thread id is reused
immediately. Is that a common occurrence?

-- Steve.
On 2025-07-02 15:05, Steven Rostedt wrote:
> On Wed, 2 Jul 2025 14:47:10 -0400
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
>>
>> AFAIR, one of the goals here is to save the cookie into the trace
>> to allow trace post-processing to link the event triggering the
>> unwinding with the deferred unwinding data.
>>
>> In order to make the trace analysis results reliable, we'd like
>> to avoid the following causes of uncertainty, which would
>> mistakenly cause the post-processing analysis to associate
>> a stack trace with the wrong event:
>>
>> - Thread ID re-use (exit + clone/fork),
>> - Thread migration,
>> - Events discarded (e.g. buffer full) causing missing
>>   thread lifetime events or missing unwind-related events.
>>
>> Unless I'm missing something, the per-thread counter would have
>> issues with thread ID re-use during the trace lifetime.
>
> But you are missing one more thing that the trace can use, and that's
> the time sequence. As soon as the same thread has a new id you can
> assume all the older user space traces are not applicable for any new
> events for that thread, or any other thread with the same thread ID.

In order for the scheme you describe to work, you need:

- instrumentation of task lifetime (exit/fork+clone),
- be sure that the events related to that instrumentation were not
  dropped.

I'm not sure about ftrace, but in LTTng enabling instrumentation of
task lifetime is entirely up to the user.

And even if it's enabled, events can be discarded (e.g. buffer full).

>
> Thus the only issue that can truly be a problem is if you have missed
> events where thread id wraps around. I guess that could be possible if
> a long running task finally exits and its thread id is reused
> immediately. Is that a common occurrence?

You just need a combination of thread ID re-use and either no
instrumentation of task lifetime or events discarded to trigger this.

Even if it's not so frequent, at large scale and in production, I
suspect that this will happen quite often. You don't even need the task
IDs to be re-used very quickly for this to be an issue.

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Wed, 2 Jul 2025 15:12:45 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> > But you are missing one more thing that the trace can use, and that's
> > the time sequence. As soon as the same thread has a new id you can
> > assume all the older user space traces are not applicable for any new
> > events for that thread, or any other thread with the same thread ID.
>
> In order for the scheme you describe to work, you need:
>
> - instrumentation of task lifetime (exit/fork+clone),
> - be sure that the events related to that instrumentation were not
>   dropped.
>
> I'm not sure about ftrace, but in LTTng enabling instrumentation of
> task lifetime is entirely up to the user.

Has nothing to do with task lifetime. If you see a deferred request
with id of 1 from task 8888, and then later you see either a deferred
request or a stack trace with an id other than 1 for task 8888, you can
then say all events before now are no longer eligible for new deferred
stack traces.

>
> And even if it's enabled, events can be discarded (e.g. buffer full).

The only case is if you see a deferred request with id 1 for task 8888,
then you start dropping all events and that task 8888 exits and a new
one appears with task id 8888 where it too has a deferred request with
id 1, then you start picking up events again and see a deferred stack
trace for the new task 8888 where its id is 1, you lose.

But other than that exact scenario, it should not get confused.

>
> >
> > Thus the only issue that can truly be a problem is if you have missed
> > events where thread id wraps around. I guess that could be possible if
> > a long running task finally exits and its thread id is reused
> > immediately. Is that a common occurrence?
>
> You just need a combination of thread ID re-use and either no
> instrumentation of task lifetime or events discarded to trigger this.

Again, it's seeing a new request with another id for the same task, you
don't need to worry about it. You don't even need to look at fork and
exit events.

> Even if it's not so frequent, at large scale and in production, I
> suspect that this will happen quite often.

Really? As I explained above?

-- Steve
On 2025-07-02 15:21, Steven Rostedt wrote:
> On Wed, 2 Jul 2025 15:12:45 -0400
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
>
>>> But you are missing one more thing that the trace can use, and that's
>>> the time sequence. As soon as the same thread has a new id you can
>>> assume all the older user space traces are not applicable for any new
>>> events for that thread, or any other thread with the same thread ID.
>>
>> In order for the scheme you describe to work, you need:
>>
>> - instrumentation of task lifetime (exit/fork+clone),
>> - be sure that the events related to that instrumentation were not
>>   dropped.
>>
>> I'm not sure about ftrace, but in LTTng enabling instrumentation of
>> task lifetime is entirely up to the user.
>
> Has nothing to do with task lifetime. If you see a deferred request
> with id of 1 from task 8888, and then later you see either a deferred
> request or a stack trace with an id other than 1 for task 8888, you can
> then say all events before now are no longer eligible for new deferred
> stack traces.
>
>> And even if it's enabled, events can be discarded (e.g. buffer full).
>
> The only case is if you see a deferred request with id 1 for task 8888,
> then you start dropping all events and that task 8888 exits and a new
> one appears with task id 8888 where it too has a deferred request with
> id 1, then you start picking up events again and see a deferred stack
> trace for the new task 8888 where its id is 1, you lose.
>
> But other than that exact scenario, it should not get confused.

Correct.

>
>>
>>> Thus the only issue that can truly be a problem is if you have missed
>>> events where thread id wraps around. I guess that could be possible if
>>> a long running task finally exits and its thread id is reused
>>> immediately. Is that a common occurrence?
>>
>> You just need a combination of thread ID re-use and either no
>> instrumentation of task lifetime or events discarded to trigger this.
>
> Again, it's seeing a new request with another id for the same task, you
> don't need to worry about it. You don't even need to look at fork and
> exit events.

The reason why instrumentation of exit/{fork,clone} is useful is to
know when a thread ID is re-used.

>
>> Even if it's not so frequent, at large scale and in production, I
>> suspect that this will happen quite often.
>
> Really? As I explained above?

Note that all newly forked threads will likely start counting near 0.
So chances are that for short-lived threads most of the counter values
will be in a low range. So all you need is thread ID re-use for two
threads which happen to use the deferred cookies within low-value
ranges to hit this.

From my perspective, making trace analysis results reliable is the most
basic guarantee tooling should provide in order to make it trusted by
users. So I am tempted to err towards robustness rather than take
shortcuts because "it does not happen often".

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Wed, 2 Jul 2025 15:43:08 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> From my perspective, making trace analysis results reliable is the most
> basic guarantee tooling should provide in order to make it trusted by
> users. So I am tempted to err towards robustness rather than take
> shortcuts because "it does not happen often".

Another solution which I'm thinking of doing for perf is to simply
state: a deferred stack trace does not go to any event before events
were dropped. That is, even if the stack trace is associated, if events
are dropped before getting out of the kernel, just say, "Sorry, the
events before the dropped events lost their user stack trace".

That may be better, as having no data is better than incorrect data.

-- Steve
On Wed, 2 Jul 2025 15:21:11 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> The only case is if you see a deferred request with id 1 for task 8888,
> then you start dropping all events and that task 8888 exits and a new
> one appears with task id 8888 where it too has a deferred request with
> id 1, then you start picking up events again and see a deferred stack
> trace for the new task 8888 where its id is 1, you lose.

And if we want to fix that, we could make the cookie 64 bit again, and
set the timestamp on the first time it is used for the trace.

	union unwind_task_id {
		struct {
			u32	task_id;
			u32	cnt;
		};
		u64	id;
	};

	static u64 get_cookie(struct unwind_task_info *info)
	{
		u32 cnt = READ_ONCE(info->id.cnt);
		u32 new_cnt;

		if (cnt & 1)
			return info->id;

		if (unlikely(!info->id.task_id)) {
			u32 task_id = local_clock();

			cnt = 0;
			if (try_cmpxchg(&info->id.task_id, &cnt, task_id))
				task_id = cnt;
		}

		new_cnt = cnt + 3;
		if (try_cmpxchg(&info->id, &cnt, new_cnt))
			new_cnt = cnt; // try_cmpxchg() expects something

		return info->id;
	}

So now each task will have its own id and even if we have a task wrap
around, the cookie will never be the same, as fork sets the info->id to
zero.

Yes, the local_clock() can wrap around, but now making all those the
same to cause an issue is extremely unlikely, and still, if it happens,
the worst thing that it causes is that the user space stack trace will
be associated to the wrong events.

-- Steve
On Wed, 2 Jul 2025 15:36:00 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

>	union unwind_task_id {
>		struct {
>			u32	task_id;
>			u32	cnt;
>		};
>		u64	id;
>	};
>
>	static u64 get_cookie(struct unwind_task_info *info)
>	{
>		u32 cnt = READ_ONCE(info->id.cnt);
>		u32 new_cnt;
>
>		if (cnt & 1)
>			return info->id;
>
>		if (unlikely(!info->id.task_id)) {
>			u32 task_id = local_clock();
>
>			cnt = 0;
>			if (try_cmpxchg(&info->id.task_id, &cnt, task_id))
>				task_id = cnt;
>		}
>
>		new_cnt = cnt + 3;
>		if (try_cmpxchg(&info->id, &cnt, new_cnt))
>			new_cnt = cnt; // try_cmpxchg() expects something
>
>		return info->id;
>	}

Honestly I think this is way overkill. What I would do, is to have the
cookie saved in the event be 64 bit, but we can start with the simple
32 bit solution keeping the top 32 bits zeros. If this does indeed
become an issue in the future, we could fix it with a 64 bit number. By
making sure all the exposed "cookies" are 64 bit, it should not break
anything. The cookie is just supposed to be a random unique number that
associates a request with its deferred user space stack trace.

With any exposed cookies to user space being 64 bits, this should not
be an issue to address in the future.

-- Steve
On 2025-07-02 15:40, Steven Rostedt wrote:
> On Wed, 2 Jul 2025 15:36:00 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
>>	union unwind_task_id {
>>		struct {
>>			u32	task_id;
>>			u32	cnt;
>>		};
>>		u64	id;
>>	};
>>
>>	static u64 get_cookie(struct unwind_task_info *info)
>>	{
>>		u32 cnt = READ_ONCE(info->id.cnt);
>>		u32 new_cnt;
>>
>>		if (cnt & 1)
>>			return info->id;
>>
>>		if (unlikely(!info->id.task_id)) {
>>			u32 task_id = local_clock();
>>
>>			cnt = 0;
>>			if (try_cmpxchg(&info->id.task_id, &cnt, task_id))
>>				task_id = cnt;
>>		}
>>
>>		new_cnt = cnt + 3;
>>		if (try_cmpxchg(&info->id, &cnt, new_cnt))
>>			new_cnt = cnt; // try_cmpxchg() expects something
>>
>>		return info->id;
>>	}
>
> Honestly I think this is way overkill. What I would do, is to have the
> cookie saved in the event be 64 bit, but we can start with the simple
> 32 bit solution keeping the top 32 bits zeros. If this does indeed
> become an issue in the future, we could fix it with a 64 bit number.
> By making sure all the exposed "cookies" are 64 bit, it should not
> break anything. The cookie is just supposed to be a random unique
> number that associates a request with its deferred user space stack
> trace.
>
> With any exposed cookies to user space being 64 bits, this should not
> be an issue to address in the future.

FWIW, I liked your idea of making the cookie 64-bit with:

- 32-bit cpu number,
- 32-bit per-CPU free running counter.

This is simple, works on 32-bit archs, and it does not overflow as
often as time LSB because it counts execution contexts.

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
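[For illustration only: a minimal sketch of the cookie layout being
discussed here, a 64-bit value combining the CPU number with a per-CPU
free running counter. The union and helper names are hypothetical and
not from any posted patch.]

	#include <linux/percpu.h>
	#include <linux/smp.h>

	/* Hypothetical 64-bit cookie: CPU number + per-CPU counter */
	union unwind_ctx_cookie {
		struct {
			u32	cpu;	/* CPU the cookie was generated on */
			u32	cnt;	/* per-CPU free running counter */
		};
		u64	id;
	};

	static DEFINE_PER_CPU(u32, unwind_ctx_ctr);

	/* Called with interrupts disabled, once per kernel entry */
	static u64 make_ctx_cookie(void)
	{
		union unwind_ctx_cookie cookie;

		cookie.cpu = smp_processor_id();
		cookie.cnt = __this_cpu_inc_return(unwind_ctx_ctr);

		return cookie.id;
	}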
On Wed, 2 Jul 2025 15:48:16 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> FWIW, I liked your idea of making the cookie 64-bit with:
>
> - 32-bit cpu number,
> - 32-bit per-CPU free running counter.
>
> This is simple, works on 32-bit archs, and it does not overflow as
> often as time LSB because it counts execution contexts.

I have no problem with implementing that. But Linus had his concerns.
If he's OK with that version, then I'll go with that.

-- Steve