From: Steven Rostedt <rostedt@goodmis.org>
The deferred user space stacktrace event already does a lookup of the vma
for each address in the trace to get the file offset for those addresses,
it can also report the file itself.
Add two more arrays to the user space stacktrace event. One for the inode
number, and the other to store the device major:minor number. Now the
output looks like this:
trace-cmd-1108 [007] ..... 240.253487: <user stack unwind>
cookie=7000000000009
=> <00000000001001ca> : 1340007 : 254:3
=> <000000000000ae27> : 1308548 : 254:3
=> <00000000000107e7> : 1440347 : 254:3
=> <00000000000109d3> : 1440347 : 254:3
=> <0000000000011523> : 1440347 : 254:3
=> <000000000001f79d> : 1440347 : 254:3
=> <000000000001fb01> : 1440347 : 254:3
=> <000000000001fbc0> : 1440347 : 254:3
=> <000000000000eb7e> : 1440347 : 254:3
=> <0000000000029ca8> : 1340007 : 254:3
User space tooling can use this information to get the actual functions
from the files.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v5: https://lore.kernel.org/20250424192613.869730948@goodmis.org
- Set inode to -1L if vma is not found for that address to let user space
know that, and differentiate from a vdso section.
kernel/trace/trace.c | 26 +++++++++++++++++++++++++-
kernel/trace/trace_entries.h | 8 ++++++--
kernel/trace/trace_output.c | 27 +++++++++++++++++++++++++++
3 files changed, 58 insertions(+), 3 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3e9ef644dd64..c6e1471e4615 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3140,6 +3140,8 @@ static void trace_user_unwind_callback(struct unwind_work *unwind,
unsigned int trace_ctx;
struct vm_area_struct *vma = NULL;
unsigned long *caller;
+ unsigned long *inodes;
+ unsigned int *devs;
unsigned int offset;
int len;
int i;
@@ -3151,7 +3153,8 @@ static void trace_user_unwind_callback(struct unwind_work *unwind,
if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE_DELAY))
return;
- len = trace->nr * sizeof(unsigned long) + sizeof(*entry);
+ len = trace->nr * (sizeof(unsigned long) * 2 + sizeof(unsigned int))
+ + sizeof(*entry);
trace_ctx = tracing_gen_ctx();
@@ -3172,6 +3175,15 @@ static void trace_user_unwind_callback(struct unwind_work *unwind,
entry->__data_loc_stack = offset | (len << 16);
caller = (void *)entry + offset;
+ offset += len;
+ entry->__data_loc_inodes = offset | (len << 16);
+ inodes = (void *)entry + offset;
+
+ offset += len;
+ len = sizeof(unsigned int) * trace->nr;
+ entry->__data_loc_dev = offset | (len << 16);
+ devs = (void *)entry + offset;
+
for (i = 0; i < trace->nr; i++) {
unsigned long addr = trace->entries[i];
@@ -3180,9 +3192,21 @@ static void trace_user_unwind_callback(struct unwind_work *unwind,
if (!vma) {
caller[i] = addr;
+ /* Use -1 to denote no vma found */
+ inodes[i] = -1L;
+ devs[i] = 0;
continue;
}
+
caller[i] = (addr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
+
+ if (vma->vm_file && vma->vm_file->f_inode) {
+ inodes[i] = vma->vm_file->f_inode->i_ino;
+ devs[i] = vma->vm_file->f_inode->i_sb->s_dev;
+ } else {
+ inodes[i] = 0;
+ devs[i] = 0;
+ }
}
__buffer_unlock_commit(buffer, event);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 40dc53ead0a8..5f7b72359901 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -256,10 +256,14 @@ FTRACE_ENTRY(user_unwind_stack, userunwind_stack_entry,
F_STRUCT(
__field( u64, cookie )
__dynamic_array( unsigned long, stack )
+ __dynamic_array( unsigned long, inodes )
+ __dynamic_array( unsigned int, dev )
),
- F_printk("cookie=%lld\n%s", __entry->cookie,
- __print_dynamic_array(stack, sizeof(unsigned long)))
+ F_printk("cookie=%lld\n%s%s%s", __entry->cookie,
+ __print_dynamic_array(stack, sizeof(unsigned long)),
+ __print_dynamic_array(inodes, sizeof(unsigned long)),
+ __print_dynamic_array(dev, sizeof(unsigned long)))
);
FTRACE_ENTRY(user_unwind_cookie, userunwind_cookie_entry,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 9489537533f7..437e5f23b73d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1410,9 +1410,13 @@ static enum print_line_t trace_user_unwind_stack_print(struct trace_iterator *it
struct userunwind_stack_entry *field;
struct trace_seq *s = &iter->seq;
unsigned long *caller;
+ unsigned long *inodes;
+ unsigned int *devs;
unsigned int offset;
unsigned int len;
unsigned int caller_cnt;
+ unsigned int inode_cnt;
+ unsigned int dev_cnt;
unsigned int i;
trace_assign_type(field, iter->ent);
@@ -1429,6 +1433,21 @@ static enum print_line_t trace_user_unwind_stack_print(struct trace_iterator *it
caller = (void *)iter->ent + offset;
+ /* The inodes and devices are also dynamic pointers */
+ offset = field->__data_loc_inodes;
+ len = offset >> 16;
+ offset = offset & 0xffff;
+ inode_cnt = len / sizeof(*inodes);
+
+ inodes = (void *)iter->ent + offset;
+
+ offset = field->__data_loc_dev;
+ len = offset >> 16;
+ offset = offset & 0xffff;
+ dev_cnt = len / sizeof(*devs);
+
+ devs = (void *)iter->ent + offset;
+
for (i = 0; i < caller_cnt; i++) {
unsigned long ip = caller[i];
@@ -1437,6 +1456,14 @@ static enum print_line_t trace_user_unwind_stack_print(struct trace_iterator *it
trace_seq_puts(s, " => ");
seq_print_user_ip(s, NULL, ip, flags);
+
+ if (i < inode_cnt) {
+ trace_seq_printf(s, " : %ld", inodes[i]);
+ if (i < dev_cnt) {
+ trace_seq_printf(s, " : %d:%d",
+ MAJOR(devs[i]), MINOR(devs[i]));
+ }
+ }
trace_seq_putc(s, '\n');
}
--
2.50.1
On Thu, 28 Aug 2025 at 11:05, Steven Rostedt <rostedt@kernel.org> wrote: > > The deferred user space stacktrace event already does a lookup of the vma > for each address in the trace to get the file offset for those addresses, > it can also report the file itself. That sounds like a good idea.. But the implementation absolutely sucks: > Add two more arrays to the user space stacktrace event. One for the inode > number, and the other to store the device major:minor number. Now the > output looks like this: WTF? Why are you back in the 1960's? What's next? The index into the paper card deck? Stop using inode numbers and device numbers already. It's the 21st century. No, cars still don't fly, but dammit, inode numbers were a great idea back in the days, but they are not acceptable any more. They *particularly* aren't acceptable when you apparently think that they are 'unsigned long'. Yes, that's the internal representation we use for inode indexing, but for example on nfs the inode is actually bigger. It's exposed to user space as a u64 through stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); so the inode that user space sees in 'struct stat' (a) doesn't actually match inode->i_ino, and (b) isn't even the full file ID that NFS actually uses. Let's not let that 60's thinking be any part of a new interface. Give the damn thing an actual filename or something *useful*, not a number that user space can't even necessarily match up to anything. Linus
On August 28, 2025 3:39:35 PM GMT-03:00, Linus Torvalds <torvalds@linux-foundation.org> wrote: >On Thu, 28 Aug 2025 at 11:05, Steven Rostedt <rostedt@kernel.org> wrote: >> >> The deferred user space stacktrace event already does a lookup of the vma >> for each address in the trace to get the file offset for those addresses, >> it can also report the file itself. > >That sounds like a good idea.. > >But the implementation absolutely sucks: > >> Add two more arrays to the user space stacktrace event. One for the inode >> number, and the other to store the device major:minor number. Now the >> output looks like this: > >WTF? Why are you back in the 1960's? What's next? The index into the >paper card deck? > >Stop using inode numbers and device numbers already. It's the 21st >century. No, cars still don't fly, but dammit, inode numbers were a >great idea back in the days, but they are not acceptable any more. > >They *particularly* aren't acceptable when you apparently think that >they are 'unsigned long'. Yes, that's the internal representation we >use for inode indexing, but for example on nfs the inode is actually >bigger. It's exposed to user space as a u64 through > > stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); > >so the inode that user space sees in 'struct stat' (a) doesn't >actually match inode->i_ino, and (b) isn't even the full file ID that >NFS actually uses. > >Let's not let that 60's thinking be any part of a new interface. > >Give the damn thing an actual filename or something *useful*, not a >number that user space can't even necessarily match up to anything. > A build ID? PERF_RECORD_MMAP went thru this, filename -> inode -> Content based hash - Arnaldo
On Thu, 28 Aug 2025 at 11:58, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > > > >Give the damn thing an actual filename or something *useful*, not a > >number that user space can't even necessarily match up to anything. > > A build ID? I think that's a better thing than the disgusting inode number, yes. That said, I think they are problematic too, in that I don't think they are universally available, so if you want to trace some executable without build ids - and there are good reasons to do that - you might hate being limited that way. So I think you'd be much better off with just actual pathnames. Are there no trace events for "mmap this path"? Create a good u64 hash from the contents of a 'struct path' (which is just two pointers: the dentry and the mnt) when mmap'ing the file, and then you can just associate the stack trace entry with that hash. That should be simple and straightforward, and hashing two pointers should be simple and straightforward. And then matching that hash against the mmap event where the actual path was saved off gives you an actual *pathname*. Which is *so* much better than those horrific inode numbers. And yes, yes, obviously filenames can go away and aren't some kind of long-term stable thing. But inode numbers can be re-used too, so that's no different. With the "create a hash of 'struct path' contents" you basically have an ID that can be associated with whatever the file name was at the time it was mmap'ed into the thing you are tracing, which is I think what you really want anyway. Now, what would be even simpler is to not create a hash at all, but simply just create the whole pathname when the stack trace entry is created. But it would probably waste too much space, since you'd probably want to have at least 32 bytes (as opposed to just 64 bits) for a (truncated) pathname. And it would be more expensive than just hashing the dentry/mnt pointers, although '%pD' isn't actually *that* expensive. 
But probably expensive enough to not really be acceptable. I'm just throwing it out as a stupid idea that at least generates much more usable output than the inode numbers do. Linus
On Thu, 28 Aug 2025 12:18:39 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Thu, 28 Aug 2025 at 11:58, Arnaldo Carvalho de Melo > <arnaldo.melo@gmail.com> wrote: > > > > > >Give the damn thing an actual filename or something *useful*, not a > > >number that user space can't even necessarily match up to anything. > > > > A build ID? > > I think that's a better thing than the disgusting inode number, yes. I don't care what it is. I picked inode/device just because it was the only thing I saw available. I'm not sure build ID is appropriate either. > > That said, I think they are problematic too, in that I don't think > they are universally available, so if you want to trace some > executable without build ids - and there are good reasons to do that - > you might hate being limited that way. > > So I think you'd be much better off with just actual pathnames. As you mentioned below, the reason I avoided path names is that they take up too much of the ring buffer, and would be duplicated all over the place. I've run this for a while, and it only picked up a couple of hundred paths while the trace had several thousand stack traces. > > Are there no trace events for "mmap this path"? Create a good u64 hash > from the contents of a 'struct path' (which is just two pointers: the > dentry and the mnt) when mmap'ing the file, and then you can just > associate the stack trace entry with that hash. I would love to have a hash to use. The next patch does the mapping of the inode numbers to their path name. It can easily be switched over to do the same with a hash number. > > That should be simple and straightforward, and hashing two pointers > should be simple and straightforward. Would a hash of these pointers have any collisions? That would be bad. Hmm, I just tried using the pointer to vma->vm_file->f_inode, and that gives me a unique number. Then I just need to map that back to the path name: trace-cmd-1016 [002] ...1. 
34.675646: inode_cache: inode=ffff8881007ed428 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libc.so.6 trace-cmd-1016 [002] ...1. 34.675893: inode_cache: inode=ffff88811970e648 dev=[254:3] path=/usr/local/lib64/libtracefs.so.1.8.2 trace-cmd-1016 [002] ...1. 34.675933: inode_cache: inode=ffff88811970b8f8 dev=[254:3] path=/usr/local/lib64/libtraceevent.so.1.8.4 trace-cmd-1016 [002] ...1. 34.675981: inode_cache: inode=ffff888110b78ba8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 bash-1007 [003] ...1. 34.677316: inode_cache: inode=ffff888103f05d38 dev=[254:3] path=/usr/bin/bash bash-1007 [003] ...1. 35.432951: inode_cache: inode=ffff888116be94b8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 bash-1018 [005] ...1. 36.104543: inode_cache: inode=ffff8881007e9dc8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 bash-1018 [005] ...1. 36.110407: inode_cache: inode=ffff888110b78298 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 bash-1018 [005] ...1. 36.110536: inode_cache: inode=ffff888103d09dc8 dev=[254:3] path=/usr/local/bin/trace-cmd I just swapped out the inode with the above (unsigned long)vma->vm_file->f_inode, and it appears to be unique. Thus, I could use that as the "hash" value and then the above could be turned into: trace-cmd-1016 [002] ...1. 34.675646: inode_cache: hash=ffff8881007ed428 path=/usr/lib/x86_64-linux-gnu/libc.so.6 trace-cmd-1016 [002] ...1. 34.675893: inode_cache: hash=ffff88811970e648 path=/usr/local/lib64/libtracefs.so.1.8.2 trace-cmd-1016 [002] ...1. 34.675933: inode_cache: hash=ffff88811970b8f8 path=/usr/local/lib64/libtraceevent.so.1.8.4 trace-cmd-1016 [002] ...1. 34.675981: inode_cache: hash=ffff888110b78ba8 path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 bash-1007 [003] ...1. 34.677316: inode_cache: hash=ffff888103f05d38 path=/usr/bin/bash bash-1007 [003] ...1. 35.432951: inode_cache: hash=ffff888116be94b8 path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 bash-1018 [005] ...1. 
36.104543: inode_cache: hash=ffff8881007e9dc8 path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 bash-1018 [005] ...1. 36.110407: inode_cache: hash=ffff888110b78298 path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 bash-1018 [005] ...1. 36.110536: inode_cache: hash=ffff888103d09dc8 path=/usr/local/bin/trace-cmd This would mean the readers of the userstacktrace_delay need to also have this event enabled to do the mappings. But that shouldn't be an issue. -- Steve
On Thu, 28 Aug 2025 at 13:17, Steven Rostedt <rostedt@kernel.org> wrote: > > > > > That should be simple and straightforward, and hashing two pointers > > should be simple and straightforward. > > Would a hash of these pointers have any collisions? That would be bad. What? Collisions in 64 bits when you have a handful of cases around? Not an issue unless you picked your hash to be something ridiculous. Linus
On Thu, 28 Aug 2025 13:38:33 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Thu, 28 Aug 2025 at 13:17, Steven Rostedt <rostedt@kernel.org> wrote: > > > > > > > > That should be simple and straightforward, and hashing two pointers > > > should be simple and straightforward. > > > > Would a hash of these pointers have any collisions? That would be bad. > > What? Collisions in 64 bits when you have a handful of cases around? > Not an issue unless you picked your hash to be something ridiculous. > Since I only need a unique identifier, and it appears that the vma->vm_file->f_inode pointer is unique, would just using that be OK? I could run it through the same hash algorithm that "%p" goes through so that it's not a real memory address. As getting to the path does require some more logic to get to. Not to mention, this may later need to handle JIT code (and we'll need a way to map to that too). -- Steve
On Thu, 28 Aug 2025 at 13:48, Steven Rostedt <rostedt@kernel.org> wrote: > > I could run it through the same hash algorithm that "%p" goes through so > that it's not a real memory address. For '%p', people can't easily trigger lots of different cases, and you can't force kernel printouts from user space. For something like tracing, user space *does* control the output, and you shouldn't give people visibility into the hashing that '%p' does. So you can certainly use siphash for hashing, but make sure to not use the same secret key that the printing does. As to the ID to hash, I actually think a 'struct file *' might be the best thing to use - that's directly in the vma, no need to follow any other pointers for it. Linus
On Thu, 28 Aug 2025 14:06:39 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > So you can certainly use siphash for hashing, but make sure to not use > the same secret key that the printing does. Right, I just meant to use the same algorithm. The key would be different. > > As to the ID to hash, I actually think a 'struct file *' might be the > best thing to use - that's directly in the vma, no need to follow any > other pointers for it. But that's unique per task, right? What I liked about the f_inode pointer, is that it appears to be shared between tasks. I only want to add a new hash and print the path for a new file. If several tasks are using the same file (which they are with the libraries), then having the hash be the same between tasks would be more efficient. -- Steve
On Thu, 28 Aug 2025 at 14:17, Steven Rostedt <rostedt@kernel.org> wrote: > > But that's unique per task, right? What I liked about the f_inode > pointer, is that it appears to be shared between tasks. I actually think the local meaning of the file pointer is an advantage. It not only means that you see the difference in mappings of the same file created with different open calls, it also means that when different processes mmap the same executable, they don't see the same hash. And because the file pointer doesn't have any long-term meaning, it also means that you also can't make the mistake of thinking the hash has a long lifetime. With an inode pointer hash, you could easily have software bugs that end up not realizing that it's a temporary hash, and that the same inode *will* get two different hashes if the inode has been flushed from memory and then loaded anew due to memory pressure. > I only want to add a new hash and print the path for a new file. If > several tasks are using the same file (which they are with the > libraries), then having the hash be the same between tasks would be > more efficient. Why? See above why I think it's a mistake to think those hashes have lifetimes. They don't. Two different inodes can have the same hash due to lifetime issues, and the same inode can get two different hashes at different times for the same reason. So you *need* to tie these things to the only lifetime that matters: the open/close pair (and the mmap - and the stack traces - will be part of that lifetime). I literally think that you are not thinking about this right if you think you can re-use the hash. Linus
On Thu, 28 Aug 2025 15:10:52 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > And because the file pointer doesn't have any long-term meaning, it > also means that you also can't make the mistake of thinking the hash > has a long lifetime. With an inode pointer hash, you could easily have > software bugs that end up not realizing that it's a temporary hash, > and that the same inode *will* get two different hashes if the inode > has been flushed from memory and then loaded anew due to memory > pressure. The hash value can actually last longer than the file pointer. Thus, if we use the file pointer for the hash, then we could risk it getting freed and then allocated again for different file. Then we get the same hash value for two different paths. What I'm looking at doing is using both the file pointer as well as its path to make the hash: struct jhash_key { void *file; struct path path; }; u32 trace_file_cache_add(struct vm_area_struct *vma) { [..] static u32 initval; u32 hash; if (!vma->vm_file) return 0; if (!initval) get_random_bytes(&initval, sizeof(initval)); jkey.file = vma->vm_file; jkey.path = vma->vm_file->f_path; hash = jhash(&jkey, sizeof(jkey), initval); if (!trace_file_cache_enabled()) return hash; [ add the hash to the rhashtable and print if unique ] Hopefully by using both the file pointer and its path to create the hash, it will stay unique for some time. -- Steve
On Fri, 29 Aug 2025 at 08:06, Steven Rostedt <rostedt@goodmis.org> wrote: > > The hash value can actually last longer than the file pointer. Thus, if we > use the file pointer for the hash, then we could risk it getting freed and > then allocated again for different file. Then we get the same hash value > for two different paths. No, no no. That doesn't mean that the hash "lasts longer" than the file pointer. Quite the reverse. It is literally about the fact that YOU HAVE TO TAKE LIFETIMES INTO ACCOUNT. So you are being confused, and that shows in your "solution". And the thing is, the "you have to take lifetimes into account' is true *regardless* of what you use as your index. It was true even with inode numbers and major/minor numbers, in that file deletion and creation would basically end up reusing the same "hash". And this is my *point*: the advantage of the 'struct file *' is that it is a local thing that gets reused potentially quite quickly, and *forces* you to get the lifetime right. So don't mess that up. Except you do, and suggest this instead: > What I'm looking at doing is using both the file pointer as well as its > path to make the hash: NO NO NO. Now you are only saying "ok, I have a bogus lifetime, so I'll make a hash where that isn't obvious any more because reuse is harder to trigger". IOW: YOU ARE MAKING THE BUG WORSE BY HIDING IT. You're not fixing anything at all. You are literally making it obvious that your design is bogus and you're not thinking things through. So stop it. Really. Instead, realize that *ANY* hash you use has a limited lifetime, and the *ONLY* validity of that random number - whether it's a hash of the file pointer, an inode number, or anything else - is DURING THE MAPPING THAT IT USES. As long as the mapping exists, you know the thing is stable, because the mapping has a reference to the file (which has a reference to the path, which has a reference to the inode - so it all stays consistent and stable). 
But *immediately* when the mapping goes away, it's now no longer valid to think it has some meaning any more. Really. It might be a temporary file that was already unlinked, and the 'munmap()' is the last thing that releases the inode and it gets deleted from disk, and a new inode is created with the exact same inode number, and maybe even the exact same 'struct inode *' pointer. And as long as you don't understand this, you will always get this wrong, and you'll create bogus "workarounds" that just hide the REAL bug. Bogus workarounds like making a random different hash that is just less likely to show your mistake. In other words, to get this right, you *have* to associate the hash with the mmap creation that precedes it in the trace. You MUST NOT reuse it, not unless you also have some kind of reference count model that keeps track of how many mmap's that hash is associated with. Put another way: the only valid way to reuse it is if you manually track the lifetime of it. Anything else is WRONG. Now, tracking the actual lifetime of the hash is probably doable, but it's complex and error-prone. You can't do it by using the reference count in the 'struct file' itself, because that would keep the lifetime of the file artificially elevated, so you'd have to do some entirely separate thing that tracks things. Don't do it. Anyway, the way to fix this is to not care about lifetimes at all: just treat the hash as the random number it is, and just accept the fact that the number gets actively reused and has no meaning. Instead, just make sure that when you *use* the hash in user space, you always associate the hash with the previous trace event for the mmap that used that hash. You need to look up the event anyway to figure out what the hash means. And this is where the whole "short lifetime" is so important. It's what *forces* you to get this right, instead of doing the wrong thing and thinking that hashes have lifetimes that they simply do not have. 
The number in the stack trace - regardless of what that number is - is *ONLY* valid if you associate it with the last mmap that created that number. You don't even have to care about the unmap event, because that unmap - while it will potentially kill the lifetime of the hash if it was the last use of that file - also means that now there won't be any new stack traces with that hash any more. So you can ignore the lifetime in that respect: all that matters is that yes, it can get re-used, but you'll see a new mmap event with that hash if it is. (And then you might still have the complexity with per-cpu trace buffers etc where the ordering between an mmap event on one CPU might not be entirely obvious wrt the stack trace even on another CPU with a different thread that shares the same VM, but that's no different from any of the other percpu trace buffer ordering issues). Linus
On Fri, 29 Aug 2025 08:47:44 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > You don't even have to care about the unmap event, because that unmap > - while it will potentially kill the lifetime of the hash if it was > the last use of that file - also means that now there won't be any new > stack traces with that hash any more. So you can ignore the lifetime > in that respect: all that matters is that yes, it can get re-used, but > you'll see a new mmap event with that hash if it is. Basically what I need is that every time I add a file/hash mapping to the hashtable, I really need a callback to know when that file goes away. And then I can remove it from the hash table, so that the next time that hash is added, it will trigger another "print the file associated with this hash". It's OK to have the same hash for multiple files as long as it is traced. All events have timestamps associated with them, so it is trivial to map which hash mapping belongs to which stack trace. The reason for the file_cache is to keep from having to do the lookup of the file at every stack trace. But if I can have a callback for when that vma gets changed, (as I'm assuming the file will last as long as the vma is unmodified), then the callback could remove the hash value and this would not be a problem. My question now is, is there a callback that can be registered by the file_cache to know when the vma or the file change? -- Steve
On Fri, 29 Aug 2025 at 09:18, Steven Rostedt <rostedt@goodmis.org> wrote: > > Basically what I need is that every time I add a file/hash mapping to the > hashtable, I really need a callback to know when that file goes away. And > then I can remove it from the hash table, so that the next time that hash is > added, it will trigger another "print the file associated with this hash". That works, but why would you care? Why don't you just register the hash value and NOT CARE. Leave it all to later when the trace gets analyzed. Leave it be. The normal situation is presumably going to be that millions of stack traces will be generated, and nobody will even look at them. > My question now is, is there a callback that can be registered by the > file_cache to know when the vma or the file change? No. And what's the point? I just told you that unmap doesn't matter. All that matters is mmap. Don't try to "reuse" hashes. Just treat them as opaque numbers. Linus
On Fri, 29 Aug 2025 09:28:41 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > Don't try to "reuse" hashes. Just treat them as opaque numbers. What do I use to make the hash? One thing this is trying to do is not have to look up the path name for every line of a stack trace. I could just have every instance do the full look up, make a hash from the path name and build id, and pass the hash to the caller. My worry is the time it takes to generate that. Perhaps I could have a hash that maps the pid with the vma->vm_start, and if that's unique, get the path and build-id and create the hash for that and send it back to the user. Save the hash for that mapping in the rhashtable with the pid/vm_start as the key. Then the code that adds the vma, will see if the pid/vma->start exists, if it does, return the hash associated with that, if it does not, add it and trigger the event that a new address has been created. -- Steve
On Fri, 29 Aug 2025 at 09:49, Steven Rostedt <rostedt@goodmis.org> wrote: > > What do I use to make the hash? Literally just '(unsigned long)(vma->vm_file)'. Nothing else. > One thing this is trying to do is not have to look up the path name for > every line of a stack trace. That's the *opposite* of what I've been suggesting. I've literally been talking about just saving off the hash of the file pointer. (And I just suggested that what you actually save off isn't even the hash - just the value - and that you can hash it later at a less critical point in time) Don't do *any* work at all at trace collection time. All you need is to literally access three fields in the 'vma': - 'vm_start' and 'vm_pgoff' are needed to calculate the offset in the file using the user space address - save off the value of 'vm_file' for later hashing and I really think you're done. Then, for the actual trace, you need two things: - you need the mmap trace event that has the 'file' value, and you create a mmap event with that value hashed, and at that point you also output the pathname and/or things like the build ID - for the stack trace events, you output the offset in the file, and you hash and output the file value now, in user space, you have all you need. All you do is match the hashes. They are random numbers, and user space cannot know what they are. They are just cookies as a mapping ID. And look, now you have the pathname and the build ID - or whatever you saved off in that mmap event. And at stack trace time, you needed to do *nothing*. And mmap is rare enough - and heavy enough - that doing that pathname and build ID at *that* point is a non-issue. See what I'm trying to say? Linus
On August 29, 2025 1:59:21 PM GMT-03:00, Linus Torvalds <torvalds@linux-foundation.org> wrote: >On Fri, 29 Aug 2025 at 09:49, Steven Rostedt <rostedt@goodmis.org> wrote: >> >> What do I use to make the hash? > >Literally just '(unsigned long)(vma->vm_file)'. > >Nothing else. > >> One thing this is trying to do is not have to look up the path name for >> every line of a stack trace. > >That's the *opposite* of what I've been suggesting. I've literally >been talking about just saving off the hash of the file pointer. > >(And I just suggested that what you actually save off isn't even the >hash - just the value - and that you can hash it later at a less >critical point in time) > >Don't do *any* work at all at trace collection time. All you need is >to literally access three fields in the 'vma': > > - 'vm_start' and 'vm_pgoff' are needed to calculate the offset in the >file using the user space address > > - save off the value of 'vm_file' for later hashing > >and I really think you're done. > >Then, for the actual trace, you need two things: > > - you need the mmap trace event that has the 'file' value, and you >create a mmap event with that value hashed, and at that point you also >output the pathname and/or things like the build ID > > - for the stack trace events, you output the offset in the file, and >you hash and output the file value > >now, in user space, you have all you need. All you do is match the >hashes. They are random numbers, and user space cannot know what they >are. They are just cookies as a mapping ID. > >And look, now you have the pathname and the build ID - or whatever you >saved off in that mmap event. And at stack trace time, you needed to >do *nothing*. > >And mmap is rare enough - and heavy enough - that doing that pathname >and build ID at *that* point is a non-issue. Or using a preexisting one in the DSO used for the executable mmap. 
As long as we don't lose those mmap events due to memory pressure/lost events and we have timestamps to order it all before lookups, yeah should work. - Arnaldo > >See what I'm trying to say? > > Linus - Arnaldo
On Fri, 29 Aug 2025 at 10:18, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > > As long as we don't lose those mmap events due to memory pressure/lost > events and we have timestamps to order it all before lookups, yeah > should work. The main reason to lose mmap events that I can see is that you start tracing in the middle of running something (for example, tracing systemd or some other "started at boot" thing). Then you'd not have any record of an actual mmap at all because it happened before you started tracing, even if there is no memory pressure or other thing going on. That is not necessarily a show-stopper: you could have some fairly simple count for "how many times have I seen this hash", and add a "mmap reminder" event (which would just be the exact same thing as the regular mmap event). You do it for the first time you see it, and every N times afterwards (maybe by simply using a counter array that is indexed by the low bits of the hash, and incrementing it for every hash you see, and if it was zero modulo N you do that "mmap reminder" thing). Yes, at that point you'd need to do that whole "generate path and build ID", but if 'N' is a large enough number, it's pretty rare. Maybe using a 16-bit counter would be sufficient (ie N would naturally be 65536 when it becomes zero again). That might be a good thing regardless just to have some guaranteed limit of how far back in the trace you need to go to find the mmap information for some hash. If you have long traces, maybe you don't want to walk back billions of events. But I wouldn't suggest doing that as a *first* implementation. I'm just saying that it could be added if people find that it's a problem. Linus
On Fri, 29 Aug 2025 10:33:38 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 10:18, Arnaldo Carvalho de Melo > <arnaldo.melo@gmail.com> wrote: > > > > As long as we don't lose those mmap events due to memory pressure/lost > > events and we have timestamps to order it all before lookups, yeah > > should work. > > The main reason to lose mmap events that I can see is that you start > tracing in the middle of running something (for example, tracing > systemd or some other "started at boot" thing). Note, for on-demand tracing, the applications are already running before the tracing starts. That is actually the common case. Yes, people do often "enabled tracing, run my code, stop tracing", but most of the use cases I deal with, it's (we are noticing something in the field, start tracing, issue gets hit, stop tracing), where the applications we are monitoring are already running when the tracing started. Just tracing the mmap when it happens will not be useful for us. Not to mention, in the future, this will also have to work with JIT. I was thinking of using 64 bit hashes in the stack trace, where the top bits are reserved for context (is this a file, or something dynamically created). > > Then you'd not have any record of an actual mmap at all because it > happened before you started tracing, even if there is no memory > pressure or other thing going on. > > That is not necessarily a show-stopper: you could have some fairly > simple count for "how many times have I seen this hash", and add a > "mmap reminder" event (which would just be the exact same thing as the > regular mmap event). I thought about clearing the file cache periodically, if for any other reason, but for dropped events where the mapping is lost. This is why I'm looking at clearing on "unmap". Yes, we don't care about unmap, but as soon as an unmap happens if that value gets used again then we know it's a new mapping. 
That is, drop the hashes out of the file cache when they are no longer around. The idea is this (pseudo code): user_stack_trace() { foreach vma in each stack frame: key = hash(vma->vm_file); if (!lookup(key)) { trace_file_map(key, generate_path(vma), generate_buildid(vma)); add_into_hash(key); } } } On unmapping: key = hash(vma->vm_file); remove_from_hash(key); Now if a new mmap happens where the vma->vm_file is reused, the lookup(key) will return false again and the file_map event will get triggered again. We don't need to look at the mmap() calls, as those new mappings may never end up in a user stack trace, and writing them out will just waste space in the ring buffer. -- Steve
On Fri, 29 Aug 2025 at 11:11, Steven Rostedt <rostedt@goodmis.org> wrote: > > The idea is this (pseudo code): > > user_stack_trace() { > foreach vma in each stack frame: > key = hash(vma->vm_file); > if (!lookup(key)) { > trace_file_map(key, generate_path(vma), generate_buildid(vma)); > add_into_hash(key); > } > } I see *zero* advantage to this. It's only doing stupid things that cost extra, and only because you don't want to do the smart thing that I've explained extensively that has *NONE* of these overheads. Just do the parsing at parse time. End of story. Or don't do this at all. Just forget the whole thing entirely. Throw the patch that started this all away, and just DON'T DO THIS. Linus
On Fri, 29 Aug 2025 13:54:08 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 11:11, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > The idea is this (pseudo code): > > > > user_stack_trace() { > > foreach vma in each stack frame: > > key = hash(vma->vm_file); > > if (!lookup(key)) { > > trace_file_map(key, generate_path(vma), generate_buildid(vma)); > > add_into_hash(key); > > } > > } > > I see *zero* advantage to this. It's only doing stupid things that > cost extra, and only because you don't want to do the smart thing that > I've explained extensively that has *NONE* of these overheads. > > Just do the parsing at parse time. End of story. What does "parsing at parse time" mean? > > Or don't do this at all. Justy forget the whole thing entirely. Throw > the patch that started this all away, and just DON'T DO THIS. Maybe we are talking past each other. When I get a user space stack trace, I get the virtual addresses of each of the user space functions. This is saved into a user stack trace event in the ring buffer that usually gets mapped right to a file for post processing. I still do the: user_stack_trace() { foreach addr each stack frame vma = vma_lookup(mm, addr); callchain[i++] = (addr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT); Are you saying that this shouldn't be done either? And to just record the virtual address in the chain and the vma->vm_start and vma->vm_pgoff in another event? Where the post processing could do the math? This other event could also record the path and build id. The question is, when do I record this vma event? How do I know it's new? I can't rely too much on other events (like mmap) and such as those events may have occurred before the tracing started. I have to have some way to know if the vma has been saved previously, which was why I had the hash lookup, and only add vma's on new instances. My main question is, when do I record the vma data event? -- Steve
On Fri, 29 Aug 2025 at 14:18, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > Just do the parsing at parse time. End of story. > > What does "parsing at parse time" mean? In user space. When parsing the trace events. Not in kernel space, when generating the events. Arnaldo already said it was workable. > When I get a user space stack trace, I get the virtual addresses of each of > the user space functions. This is saved into an user stack trace event in > the ring buffer that usually gets mapped right to a file for post > processing. > > I still do the: > > user_stack_trace() { > foreach addr each stack frame > vma = vma_lookup(mm, addr); > callchain[i++] = (addr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT); > > Are you saying that this shouldn't be done either? I'm saying that's ALL that should be done. And then that *ONE* single thing: callchain_filehash[i++] = hash(vma->vm_file); BUT NOTHING ELSE. None of that trace_file_map() garbage. None of that add_into_hash() garbage. NOTHING like that. You don't look at the hash. You don't "register" it. You don't touch it in any way. You literally just use it as a value, and user space will figure it out later. At event parsing time. At most, you could have some trivial optimization to avoid hashing the same pointer twice, ie have some single-entry cache of "it's still the same file pointer, I'll just use the same hash I calculated last time". And I mean *single*-level, because siphash is fast enough that doing anything *more* than that is going to be slower than just re-calculating the hash. In fact, you should probably do that optimization at the whole vma_lookup() level, and try to not look up the same vma multiple times when a common situation is probably that you'll have multiple stack frames all with entries pointing to the same executable (or library) mapping. Because "vma_lookup()" is likely about as expensive as the hashing is. Linus
On Fri, 29 Aug 2025 15:40:07 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 14:18, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > Just do the parsing at parse time. End of story. > > > > What does "parsing at parse time" mean? > > In user space. When parsing the trace events. > > Not in kernel space, when generating the events. > > Arnaldo already said it was workable. Perf does do things differently, as I believe it processes the events as it reads from the kernel (Arnaldo correct me if I'm wrong). For the tracefs code, the raw data gets saved directly into a file, and the processing happens after the fact. If a tool is recording, it still needs a way to know what those hash values mean, after the tracing is complete. Same for when the user cats the "trace" file. If the vma's have already been freed, when this happens, how do we map these hashes from the vma? Do we need to have trace events in the unmap to trigger them? If tracing is not recording anymore, those events will be dropped too. Also, we only want to record the vmas that are in the stack traces. Not just any vma. > > > When I get a user space stack trace, I get the virtual addresses of each of > > the user space functions. This is saved into an user stack trace event in > > the ring buffer that usually gets mapped right to a file for post > > processing. > > > > I still do the: > > > > user_stack_trace() { > > foreach addr each stack frame > > vma = vma_lookup(mm, addr); > > callchain[i++] = (addr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT); > > > > Are you saying that this shouldn't be done either? > > I'm saying that's ALL that should be done. And then that *ONE* single thing: > > callchain_filehash[i++] = hash(vma->vm_file); > > BUT NOTHING ELSE. > > None of that trace_file_map() garbage. > > None of that add_into_hash() garbage. > > NOTHING like that. You don't look at the hash. You don't "register" > it. You don't touch it in any way. 
You literally just use it as a > value, and user space will figure it out later. At event parsing time. I guess this is where I'm stuck. How does user space know what those hash values mean? Where does it get the information from? > > At most, you could have some trivial optimization to avoid hashing the > same pointer twice, ie have some single-entry cache of "it's still the > same file pointer, I'll just use the same hash I calculated last > time". > > And I mean *single*-level, because siphash is fast enough that doing > anything *more* than that is going to be slower than just > re-calculating the hash. > > In fact, you should probably do that optimization at the whole > vma_lookup() level, and try to not look up the same vma multiple times > when a common situation is probably that you'll have multiple stack > frames all with entries pointing to the same executable (or library) > mapping. Because "vma_lookup()" is likely about as expensive as the > hashing is. Yeah, we could add an optimization to store vma's in the callchain walk to see if the next call chain belongs to a previous one. Could even just cache the previous vma, as it's not as common to have one library calling into another and back again. That is, this would likely be useful: vma = NULL; foreach addr in callchain if (!vma || addr not in range of vma) vma = vma_lookup(addr); -- Steve
On Fri, 29 Aug 2025 at 16:09, Steven Rostedt <rostedt@goodmis.org> wrote: > > Perf does do things differently, as I believe it processes the events as it > reads from the kernel (Arnaldo correct me if I'm wrong). > > For the tracefs code, the raw data gets saved directly into a file, and the > processing happens after the fact. If a tool is recording, it still needs a > way to know what those hash values mean, after the tracing is complete. But the data IS ALL THERE. Really. That's the point. It's there in the same file, it just needs those mmap events that whoever parses it - whether it be perf, or somebody reading some tracefs code - sees the mmap data, sees the cookies (hash values) that implies, and then matches those cookies with the subsequent trace entry cookies. But what it does *NOT* need is munmap() events. What it does *NOT* need is translating each hash value for each entry by the kernel, when whoever reads the file can just remember and re-create it in user space. I'm done arguing. You're not listening, so I'll just let you know that I'm not pulling garbage. I've had enough garbage in tracefs, I'm still smarting from having to fix up the horrendous VFS interfaces, I'm not going to pull anything that messes up this too. Linus
On Fri, 29 Aug 2025 17:45:39 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > But what it does *NOT* need is munmap() events. > > What it does *NOT* need is translating each hash value for each entry > by the kernel, when whoever treads the file can just remember and > re-create it in user space. If we are going to rely on mmap, then we might as well get rid of the vma_lookup() altogether. The mmap event will have the mapping of the file to the actual virtual address. If we add a tracepoint at mmap that records the path and the address as well as the permissions of the mapping, then the tracer could then trace only those addresses that are executable. To handle missed events, on start of tracing, trigger the mmap event for every currently running tasks for their executable sections, and that will allow the tracer to see where the files are mapped. After that, the stack traces can go back to just showing the virtual addresses of the user space stack without doing anything else. Let the trace map the tasks memory to all the mmaps that happened and translate it that way. The downside is that there may be a lot of information to record. But the tracer could choose which task maps to trace via filters and if it's tracing all tasks, it just needs to make sure its buffer is big enough. -- Steve
On Sat, 30 Aug 2025 at 11:31, Steven Rostedt <rostedt@goodmis.org> wrote: > > If we are going to rely on mmap, then we might as well get rid of the > vma_lookup() altogether. The mmap event will have the mapping of the > file to the actual virtual address. It actually won't - not unless you also track every mremap etc. Which is certainly doable, but I'd argue that it's a lot of complexity. All you really want is an ID for the file mapping, and yes, I agree that it's very very annoying that we don't have anything that can then be correlated to user space any other way than also having a stage that tracks mmap. I've slept on it and tried to come up with something, and I can't. As mentioned, the inode->i_ino isn't actually exposed to user space as such at all for some common filesystems, so while it's very traditional, it really doesn't actually work. It's also almost impossible to turn into a path, which is what you often would want for many cases. That said, having slept on it, I'm starting to come around to the inode number model, not because I think it's a good model - it really isn't - but because it's a very historical mistake. And in particular, it's the same mistake we made in /proc/<xyz>/maps. So I think it's very very wrong, but it does have the advantage that it's a number that we already do export. But the inode we expose that way isn't actually the 'vma->vm_file->f_inode' as you'd think, it's actually inode = file_user_inode(vma->vm_file); which is subtly different for the backing inode case (ie overlayfs). Oh, how I dislike that thing, but using the same thing as /proc/<xyz>/maps does avoid some problems. Linus
On Sat, 30 Aug 2025 12:03:53 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Sat, 30 Aug 2025 at 11:31, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > If we are going to rely on mmap, then we might as well get rid of the > > vma_lookup() altogether. The mmap event will have the mapping of the > > file to the actual virtual address. > > It actually won't - not unless you also track every mremap etc. > > Which is certainly doable, but I'd argue that it's a lot of complexity. > > All you really want is an ID for the file mapping, and yes, I agree > that it's very very annoying that we don't have anything that can then > be correlated to user space any other way than also having a stage > that tracks mmap. > > I've slept on it and tried to come up with something, and I can't. As > mentioned, the inode->i_ino isn't actually exposed to user space as > such at all for some common filesystems, so while it's very > traditional, it really doesn't actually work. It's also almost > impossible to turn into a path, which is what you often would want for > many cases. > > That said, having slept on it, I'm starting to come around to the > inode number model, not because I think it's a good model - it really > isn't - but because it's a very historical mistake. > > And in particular, it's the same mistake we made in /proc/<xyz>/maps. > > So I think it's very very wrong, but it does have the advantage that > it's a number that we already do export. > > But the inode we expose that way isn't actually the > 'vma->vm_file->f_inode' as you'd think, it's actually > > inode = file_user_inode(vma->vm_file); > > which is subtly different for the backing inode case (ie overlayfs). > > Oh, how I dislike that thing, but using the same thing as > /proc/<xyz>/maps does avoid some problems. > Sorry for the late reply. 
I left to the Tracing Summit the following Monday, and when I got back home on Thursday, I came down with a nasty cold that prevented me from thinking about any of this. I just re-read the entire thread, and I'm still not sure where to go with this. Thus, let me start with what I'm trying to accomplish, and even add one example of a real world use case we would like to have. Several times we find issues with futexes causing applications to either lock up or cause long latency. Since a futex is mostly managed in user space, it's good to be able to at least have a backtrace of where a contended futex occurs. Thus we start tracing the futex system call and triggering a user space backtrace on each one. Using this information can help us figure out where the futex contention lies. This is just one use case, we do have others. Ideally, the user space stack trace should look like: futex_requeue-1044 [002] ..... 168.761423: <user stack unwind> cookie=31500000003 => <000000000009a9ee> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} => <0000000000001472> : path=/work/c/futex_requeue build_id={0xc02417ea,0x1f4e0143,0x338cf27d,0x506a7a5d,0x7884d090} => <0000000000092b7b> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} Where the above shows the callstack (offset from the file), the path to the file, and a build id of that file such that the tooling can verify that the path is indeed the same library/executable as for when the trace occurred. Note, the build-id isn't really necessary for my own use case, because the applications seldom change on a chromebook. I added it as it appears to be useful for others I've talked to that would like to use this. But printing a copy of the full path name and build-id at every stack trace is expensive. The path lookup may not be so bad, but the space on the ring buffer is. 
To compensate this, we could replace the path and build-id with a unique identifier, (being an inode/device or hash, or whatever) to associate that file. It may even work if it is unique per task. Then whenever one of these identifiers were to show up representing a new file, it would be printed. We could monitor an event that if a file is deleted, renamed, or whatever, and a new file with the same name comes around, the identifier with the path and build-id gets printed for the new file. Where the output would be, instead: sed-1037 [007] ...1. 167.362583: file_map: hash=0x51eff94b path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} [..] futex_requeue-1042 [007] ...1. 168.754128: file_map: hash=0xad2c6f1b path=/work/c/futex_requeue build_id={0xc02417ea,0x1f4e0143,0x338cf27d,0x506a7a5d,0x7884d090} [..] futex_requeue-1042 [007] ..... 168.757912: <user stack unwind> cookie=34900000008 => <00000000001001ca> : 0x51eff94b => <000000000000173c> : 0xad2c6f1b => <0000000000029ca8> : 0x51eff94b [.. repeats several more traces without having to save the path names again ..] It comes down to when do we print these mappings? I noticed that uprobes has hooks to all the mappings in the vma code as it needs to keep track of them. We could change those hooks to tracepoints, and have both uprobes and tracing monitor the changes, and when a new mapping happens, it traces it. Changing them to tracepoints may be useful anyway, as it would then turn them over to static branches and not a normal "if" statement. We could even add a file to tracefs that would trigger the dump of all files that are mapped executable for all currently running tasks. Then when tracing starts, it would trigger the "show all currently running task mappings" and then only do the mappings on demand. 
This way, the tracer would get the mappings of the identifier (or hash, or whatever) to the files and build-ids at the start of tracing, as well as get any of the mappings when they happen later on. This should have enough information for the post processing to put the stack traces back to what is ideal in the first place. That is, the tooling could output: futex_requeue-1044 [002] ..... 168.761423: <user stack unwind> cookie=31500000003 => <000000000009a9ee> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} => <0000000000001472> : path=/work/c/futex_requeue build_id={0xc02417ea,0x1f4e0143,0x338cf27d,0x506a7a5d,0x7884d090} => <0000000000092b7b> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} and hide the identifier that was used in the ring buffer. -- Steve
On Mon, 8 Sept 2025 at 14:42, Steven Rostedt <rostedt@goodmis.org> wrote: > > I just re-read the entire thread, and I'm still not sure where to go with > this. So honestly, I don't know how to get where you want to get - or whether it's even *possible* without horrible performance impact. And no, we're not adding crap interfaces to mmap/munmap just for a stupid sysfs tracing thing. > Ideally, the user space stack trace should look like: > > futex_requeue-1044 [002] ..... 168.761423: <user stack unwind> cookie=31500000003 > => <000000000009a9ee> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} > => <0000000000001472> : path=/work/c/futex_requeue build_id={0xc02417ea,0x1f4e0143,0x338cf27d,0x506a7a5d,0x7884d090} > => <0000000000092b7b> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} Yes. And I think that's what you should aim to generate. Not inode numbers, because inode numbers are the wrong thing. > Note, the build-id isn't really necessary for my own use case, because the > applications seldom change on a chromebook. I added it as it appears to be > useful for others I've talked to that would like to use this. My personal suspicion is that in reality, the pathname is sufficient. It's certainly a lot better than inode numbers are, in that the pathname is meaningful even after-the-fact, and even on a different machine etc. It's not some guaranteed match with some particular library or executable version, no. But for some random one-time quick scripting thing that uses sysfs, it's probably "good enough". The build id is certainly very convenient too, but it's not *always* convenient. And 99% of the time you could just look up the build id from the path, even though obviously that wouldn't work across machines and wouldn't work across system updates. 
> But printing a copy of the full path name and build-id at every stack trace > is expensive. The path lookup may not be so bad, but the space on the ring > buffer is. So that's the thing. You can do it right, or you can do it wrong. I'd personally tend to prefer the "expensive but right", and just make it a trace-time option. > To compensate this, we could replace the path and build-id with a unique > identifier, (being an inode/device or hash, or whatever) to associate that > file. It may even work if it is unique per task. Then whenever one of these > identifiers were to show up representing a new file, it would be printed. So I really hate the inode number, because it's just wrong. You can't match it across machines, and to make things worse it's not even *meaningful* over time or over machines - or to humans - so it's strictly clearly objectively worse than the pathname. But more importantly - even on the *local* machine - and at the moment - it's actually wrong. Exactly because the inode number you look up is *not* the user-visible inode number from 'stat()'. So it's *really* wrong to use the inode number. It's basically never right. And never will be, even if you can make it appear useful in some specific cases. The *one* saving grace for the inode number is that *in*the*moment* you can match it against /proc/<pid>/maps, because that /proc file has that historical bug too (it wasn't buggy at the time that /proc file was introduced, but our filesystems have become much more complex since). So if you do that inode = file_user_inode(vma->vm_file); that I mentioned, at least the otherwise random inode numbers can be matched to *something*. That still doesn't fix the other issues with inode numbers, but it means that at the time of the trace - and on the machine that the tracing is done - you can now match that not-quite-real inode number and device against another /proc file, and turn it into a pathname. 
But it's kind of sad to do that, when you could just do the pathname in the trace directly, and not force the stupid interface in the first place. And honestly, at that point it's still not really *better* than the pathname (and arguably much much worse, because you might not be able to do the matching if you didn't catch the /proc/<pid>/maps file). So the inode number - together with a lookup in /proc/<pid>/maps - is generally about the same as just giving a path, but typically much less convenient, and anybody using that interface would have to do extra work in user space. And *none* of these issues would be true of somebody who uses the 'perf()' interface that can do all of this much more efficiently, and without the downsides, and without any artificially limited sysfs interfaces. So that really makes me go: just don't expose this at all in sysfs files. You *cannot* do a good job in sysfs, because the interface is strictly worse than just doing the proper job using perf. Alternatively, just do the expensive thing. Expose the actual pathname, and expose the build ID. Yes, it's expensive, but dammit, that's the whole *point* of tracing in sysfs. sysfs was never about being efficient, it was about convenience. So if you trace through sysfs, you either don't get the full information that could be there, or pay the price for the expense of generating the full info. Make the "give me the expensive output" be a dynamic flag, so that you don't do it by default, but if you have some model where you are scripting things with shell-script rather than doing 'perf record', at least you get good output. Hmm? Linus
On Mon, 8 Sep 2025 16:09:50 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > > To compensate this, we could replace the path and build-id with a unique > > identifier, (being an inode/device or hash, or whatever) to associate that > > file. It may even work if it is unique per task. Then whenever one of these > > identifiers were to show up representing a new file, it would be printed. > > So I really hate the inode number, because it's just wrong. I just mentioned an identifier, it didn't need to be the inode. > > So if you do that > > inode = file_user_inode(vma->vm_file); And if I do end up using an inode, I'll make sure to use that. > And *none* of these issues would be true of somebody who uses the > 'perf()' interface that can do all of this much more efficiently, and > without the downsides, and without any artificially limited sysfs > interfaces. Note, there is no user space component running during the trace when tracing with tracefs, whereas perf requires a user space tool to be running along with what is being traced. The idea, is not to affect what is being traced by a user space tracer. Tracing is started when needed, and when the anomaly is detected, tracing is stopped, and then the tooling extracts the trace and post processes it. > > So that really makes me go: just don't expose this at all in sysfs > files. You *cannot* do a good job in sysfs, because the interface is > strictly worse than just doing the proper job using perf. > > Alternatively, just do the expensive thing. Expose the actual > pathname, and expose the build ID. Yes, it's expensive, but dammit, > that's the whole *point* of tracing in sysfs. sysfs was never about > being efficient, it was about convenience. Technically, it's "tracefs" and not "sysfs". When tracefs is configured, sysfs will create a directory called /sys/kernel/tracing to allow user space to mount tracefs there, but it is still a separate file system which can be mounted outside of sysfs. 
The code in tracefs is designed to be very efficient and tries very hard to keep the overhead down. The tracefs ring buffer is still 2 or 3 times faster than the perf buffer. It is optimized for tracing, and that's even why it doesn't rely on a user space component, as it's another way to allow always-on-tracing to not affect the system as much while tracing. > > So if you trace through sysfs, you either don't get the full > information that could be there, or pay the price for the expense of > generating the full info. But I will say the time to get the path name isn't an issue here. It's the size of the path name being recorded into the ring buffer. The ring buffer's size is limited, and a lot of compaction techniques are used to try to use it efficiently. As the stack trace only happens when the task goes back to user space, it's not as a time sensitive event as say function tracing. Thus spending a few more microseconds on something isn't going to cause much of a notice. An 8 byte identifier for a file is much better than the path name where it can be 40 bytes or more. In my example: /usr/lib/x86_64-linux-gnu/libselinux.so.1 is 41 bytes, 42 if you count the nul terminating byte. Now if we just hash the path name, that would be doable. Then when we see a new name pop up, we can trigger another event to show the path name (and perhaps even the build id). What's nice about triggering another event to show the full path name, is that you can put that other event into another buffer, to keep the path names from dropping stack traces, and vice versa. I liked an idea you had in a previous email: https://lore.kernel.org/all/CAHk-=wjgdKtBAAu10W04VTktRcgEMZu+92sf1PW-TV-cfZO3OQ@mail.gmail.com/ You do it for the first time you see it, and every N times afterwards (maybe by simply using a counter array that is indexed by the low bits of the hash, and incrementing it for every hash you see, and if it was zero modulo N you do that "mmap reminder" thing). 
Instead of an N counter, have a time expiry of say 200 milliseconds or so. At the stack trace, look at the path name, hash it, put it into a look up table, and if it's not there, trigger the other event to show the path name and save it and a timestamp into the look up table. If it's in the look up table, and the timestamp is greater than 200 milliseconds, trigger it again and reset the timestamp. The idea is only to remove duplicates, and move the longer names into a separate buffer. Recording full path names in every stack trace will make it much harder to use the information as it will lose a lot more stack traces. > > Make the "give me the expensive output" be a dynamic flag, so that you > don't do it by default, but if you have some model where you are > scripting things with shell-script rather than doing 'perf record', at > least you get good output. Note, it's not a shell script. We do have tooling, it's just that nothing runs while the trace is happening. The tooling starts the trace and exits. When the issue is discovered, tracing is stopped, and the tooling will then extract the trace and process it. If a crash occurs, the persistent ring buffer can be extracted to get the data. We will use this in the field. That is, on chromebooks where people have opted in to allow analysis of their machines. If there's an anomaly detected in thousands of users, we can start tracing and then extract the traces to debug what is happening on their machines. We want to make sure we get enough stack traces that will go back far enough to where the issue first occurred. Hence why we want to keep the traces small and compact. -- Steve
On Mon, 8 Sept 2025 at 16:09, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > Make the "give me the expensive output" be a dynamic flag, so that you > don't do it by default, but if you have some model where you are > scripting things with shell-script rather than doing 'perf record', at > least you get good output. Side note: you could make that dynamic flag be basically "per-target", by having the *tracer* open the files that it wants to match against, and guaranteeing that the dentry stays around by virtue of having opened the files. Then - I'm handwaving a bit here - you could have some "hash the dentry pointer" model. In that model, you couldn't use the 'struct file' hash, because now you're matching against different 'open()' cases: the tracer that uses sysfs would open the executable and the libraries it knows it is going to trace, and keep them open for the duration of the trace in order to have stable hashes for those files. All the tracer would need is some simple interface to "give me the hash for the file I just opened", and then it could easily match that against any hashes it sees in sysfs stack traces. The advantage of this model is that now the tracer not only has the hash, and controls the lifetime, it means that the tracer also can decide to look up build IDs etc if it wants to. The disadvantage is obvious, though: this requires that the tracer know what the files in question are. Of course, that's usually not that hard. You might literally just know it a-priori (ie just from what you are tracing along with having run 'ldd' etc), but for the 'I'm tracing a running process' you can use that /proc/<pid>/maps file to start populating your hash information. 
I'm *not* claiming this is a wonderful interface, but it's at least a *fairly* straightforward way to give some kind of cheap hash ID for the user space traces, and it puts the burden of "hash lifetime" clearly on user space, not on the kernel having to try to maintain some kind of hash map. In other words: if user space wants to get good information, maybe user space needs to work at it a bit. The kernel side shouldn't be made complicated or be expected to bend over backwards. Linus
On August 30, 2025 3:31:14 PM GMT-03:00, Steven Rostedt <rostedt@goodmis.org> wrote: >On Fri, 29 Aug 2025 17:45:39 -0700 >Linus Torvalds <torvalds@linux-foundation.org> wrote: > >> But what it does *NOT* need is munmap() events. >> >> What it does *NOT* need is translating each hash value for each entry >> by the kernel, when whoever treads the file can just remember and >> re-create it in user space. > >If we are going to rely on mmap, then we might as well get rid of the >vma_lookup() altogether. The mmap event will have the mapping of the >file to the actual virtual address. > >If we add a tracepoint at mmap that records the path and the address as >well as the permissions of the mapping, then the tracer could then >trace only those addresses that are executable. > PERF_RECORD_MMAP2 (MMAP had just the filename); <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/perf_event.h#n1057> >To handle missed events, on start of tracing, trigger the mmap event >for every currently running tasks for their executable sections, and >that will allow the tracer to see where the files are mapped. Perf does synthesize the needed mmap events by traversing procfs, if needed. Jiri at some point toyed with BPF iterators to do as you suggest: from the kernel iterate task structs and generate the PERF_RECORD_MMAP2 for preexisting processes. >After that, the stack traces can go back to just showing the virtual >addresses of the user space stack without doing anything else. Let the >trace map the tasks memory to all the mmaps that happened and translate >it that way. > >The downside is that there may be a lot of information to record. It is, but for system wide cases, etc. Want to see it all? There's a cost... - Arnaldo But >the tracer could choose which task maps to trace via filters and if it's >tracing all tasks, it just needs to make sure its buffer is big enough. > >-- Steve - Arnaldo
On Fri, 29 Aug 2025 17:45:39 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 16:09, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > Perf does do things differently, as I believe it processes the events as it > > reads from the kernel (Arnaldo correct me if I'm wrong). > > > > For the tracefs code, the raw data gets saved directly into a file, and the > > processing happens after the fact. If a tool is recording, it still needs a > > way to know what those hash values mean, after the tracing is complete. > > But the data IS ALL THERE. But only in the kernel. How do I expose it? > > Really. That's the point. > > It's there in the same file, it just needs those mmap events that > whoever parses it - whether it be perf, or somebody reading some What mmap events are you talking about? Nothing happens to be tracing mmap events. An interrupt triggered, we want a user space stack trace for that interrupt, it records the kernel stack trace and a cookie that gets matched to the user stack trace. It is then deferred until it goes back to user space and the deferred infrastructure does a callback to the tracer with the list of addresses that represent the user space call stack. We do a vma_lookup() to get the vma of each of those addresses. Now we make some hash that represents that vma for each address. But there has been no event that maps to this vma to what the file is. And the vma's in these stack traces are a subset of all the vma's. When the user finally gets around to reading them, the vmas could be long gone. How is user space supposed to find out what files they belong to? Do we need to record most events to grab all the vma's and the files they belong to? Note, one of the constraints to tracing is the buffer size. We don't want to be recording information that we don't care about. 
> tracefs code - sees the mmap data, sees the cookies (hash values) that > implies, and then matches those cookies with the subsequent trace > entry cookies. That was basically what I was doing with the vma hash table. To print out the vmas as soon as a new one is referenced. It created the event needed, and only for the vmas we care about. > > But what it does *NOT* need is munmap() events. This wouldn't be recording munmap events. It would use the unmap event to callback and remove the vma from the hash when they happened, so that if they get reused the new ones would be printed. It's no different if we use munmap or mmap. I could hook into the mmap event instead and check if it is in the vma hash and if so, either reprint it, or remove it so if the vma is in a call stack it would get reprinted. Writing the file for every mmap seems to be a waste of ring buffer space if the majority of them are not going to be in a stack trace. > > What it does *NOT* need is translating each hash value for each entry > by the kernel, when whoever reads the file can just remember and > re-create it in user space. What's reading the files? The applications that are being traced? > > I'm done arguing. You're not listening, so I'll just let you know that I am listening. I'm just not understanding you. > I'm not pulling garbage. I've had enough garbage in tracefs, I'm still > smarting from having to fix up the horrendous VFS interfaces, I'm not > going to pull anything that messes up this too. I know you keep bringing up the tracefs eventfs issue. Hey, I asked for help with that when I first started it. I was basically told by some of the VFS folks (I'm not going to name names) that "don't worry, if it works it's fine". I was very worried that I wasn't doing it right. And it wasn't until you got involved where you were the first one to tell me that using dentry outside of VFS was a bad idea. Most of our arguing then was because I didn't understand that. 
That also led to the "garbage" code you had to fix up. So keep bringing that up. It just shows how much tribal knowledge is needed to work in the kernel. Heck, the VFS folks are still arguing about how to handle things like kernfs. Which is similar to the eventfs issue. And that boils down to things like kernfs, eventfs and procfs have a fundamental difference to all other file systems. And that is it's a file interface to the kernel itself, and not some external source. I realized this during our arguments over eventfs. You do a write or read from a file, and unlike other file systems, those actions trigger kernel functions outside of vfs. But this is another topic altogether, and I only brought it up because you did. -- Steve
On Fri, 29 Aug 2025 21:20:23 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > > I'm done arguing. You're not listening, so I'll just let you know that > > I am listening. I'm just not understanding you. BTW, I'm not arguing with you. I'm really trying hard to figure out what it is that you want me to do. I'm looking for that "don't use dentry outside of VFS" moment. I get we have in the call stack the offsets of the file and a magical hash value that represents that vma. What I don't get is what is user space supposed to match that magical hash value to? Do you want me to trace all mmaps and trigger an event for them that show the hash value and the path names? If that's the case, what do I do about the major use case of tracing an application after it has mapped all its memory to files? What about wasted ring buffer space for recording every mmap when the majority of them will not be used? It could risk dropping events of the mmaps we care about. Again, I'm not arguing with you. I'm trying to figure out what you are suggesting. -- Steve
On Fri, 29 Aug 2025 19:09:35 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > Yeah, we could add an optimization to store vma's in the callchain walk to > see if the next call chain belongs to a previous one. Could even just cache > the previous vma, as it's not as common to have one library calling into > another and back again. Although, it does happen with libc. :-p cookie=300000004 => <000000000008f687> : 0x666220af => <0000000000014560> : 0x88512fee => <000000000001f94a> : 0x88512fee => <000000000001fc9e> : 0x88512fee => <000000000001fcfa> : 0x88512fee => <000000000000ebae> : 0x88512fee => <0000000000029ca8> : 0x666220af The 0x666220af is libc, where the first item is (according to objdump): 000000000008f570 <__libc_alloca_cutoff@@GLIBC_PRIVATE>: And the last one (top of the stack) is: 0000000000029c20 <__libc_init_first@@GLIBC_2.2.5>: Of course libc starts the application, and then the application will likely call back into libc. We could optimize for this case with: first_vma = NULL; vma = NULL; foreach addr in callchain if (!first_vma) vma = first_vma = vma_alloc() else if (addr in range of first_vma) vma = first_vma else (addr not in range of vma) vma = vma_lookup(addr); -- Steve
On Fri, 29 Aug 2025 19:09:35 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > > NOTHING like that. You don't look at the hash. You don't "register" > > it. You don't touch it in any way. You literally just use it as a > > value, and user space will figure it out later. At event parsing time. > > I guess this is where I'm stuck. How does user space know what those hash > values mean? Where does it get the information from? This is why I had the stored hash items in a "file_cache". It was the way to know that a new vma is being used and needs an event to show it. > > That is, this would likely be useful: > > vma = NULL; > foreach addr in callchain > if (!vma || addr not in range of vma) > vma = vma_lookup(addr); You already stated that the vma_lookup() and the hash algorithm is very expensive, but they need to be done anyway. A simple hash lookup is quick and would be lost in the noise. vma = NULL; hash = 0; foreach addr in callchain if (!vma || addr not in range of vma) { vma = vma_lookup(addr); hash = get_hash(vma); } callchain[i] = addr - offset; hash[i] = hash; I had that get_hash(vma) have something like: u32 get_hash(vma) { unsigned long ptr = (unsigned long)vma->vm_file; u32 hash; /* Remove alignment */ ptr >>= 3; hash = siphash_1u32((u32)ptr, &key); if (lookup_hash(hash)) return hash; // already saved // The above is the most common case and is quick. // Especially compared to vma_lookup() and the hash algorithm /* Slow but only happens when a new vma is discovered */ trigger_event_that_maps_hash_to_file_data(hash, vma); /* Doesn't happen again for this hash value */ save_hash(hash); This is also where I would have a callback from munmap() to remove the vmas from this hash table because they are no longer around. And if a new vma came around with the same vm_file address, it would not be found in the hash table and would trigger the print again with the hash and the new file it represents. 
This "garbage" was how I implemented the way to let user space know what the meaning of the hash values are. Otherwise, we need something else to expose to user space what those hashes mean. And that's where I don't know what you are expecting. -- Steve
On Fri, 29 Aug 2025 19:42:46 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > vma = NULL; > hash = 0; > foreach addr in callchain > if (!vma || addr not in range of vma) { > vma = vma_lookup(addr); > hash = get_hash(vma); > } > callchain[i] = addr - offset; > hash[i] = hash; > > > I had that get_hash(vma) have something like: > > > u32 get_hash(vma) { > unsigned long ptr = (unsigned long)vma->vm_file; > u32 hash; > > /* Remove alignment */ > ptr >>= 3; > hash = siphash_1u32((u32)ptr, &key); Oh, this hash isn't that great, as it did appear to have collisions. But I saw in vsprintf() it has something like: #ifdef CONFIG_64BIT return (u32)(unsigned long)siphash_1u64((u64)ptr, &key); #else return (u32)siphash_1u32((u32)ptr, &key); #endif Which for the 64 bit version, it uses all the bits to calculate the hash, and the resulting bottom 32 is rather a good spread. > > if (lookup_hash(hash)) > return hash; // already saved > > // The above is the most common case and is quick. > // Especially compared to vma_lookup() and the hash algorithm > > /* Slow but only happens when a new vma is discovered */ > trigger_event_that_maps_hash_to_file_data(hash, vma); > > /* Doesn't happen again for this hash value */ > save_hash(hash); So this basically creates the output of: trace-cmd-1034 [003] ..... 142.197674: <user stack unwind> cookie=300000004 => <000000000008f687> : 0x666220af => <0000000000014560> : 0x88512fee => <000000000001f94a> : 0x88512fee => <000000000001fc9e> : 0x88512fee => <000000000001fcfa> : 0x88512fee => <000000000000ebae> : 0x88512fee => <0000000000029ca8> : 0x666220af trace-cmd-1034 [003] ...1. 142.198063: file_cache: hash=0x666220af path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1034 [003] ...1. 
142.198093: file_cache: hash=0x88512fee path=/usr/local/bin/trace-cmd build_id={0x3f399e26,0xf9eb2d4d,0x475fa369,0xf5bb7eeb,0x6244ae85} Where the first instances of the vma with the values of 0x666220af and 0x88512fee get printed, but from then on, they are not. That is, from then on, the lookup will return true, and no processing will take place. And periodically, I could clear the hash cache, so that all vmas get printed again. But this would be rate limited to not cause performance issues. -- Steve
On Fri, 29 Aug 2025 at 08:47, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > Anyway, the way to fix this is to not care about lifetimes at all: > just treat the hash as the random number it is, and just accept the > fact that the number gets actively reused and has no meaning. Side note: the actual re-use of various pointers and/or inode numbers is going to be very very random. Classic old filesystems that live by inode numbers will use 'iget5_locked()' and will basically have the same 'struct inode *' pointer too when they re-use an inode number. And they likely also have a very simplistic inode allocation model and an unlink followed by a file creation probably *will* re-use that same inode number. So you can probably see 'struct inode *' get reused quite quickly and reliably for entirely unrelated files just based on file deletion/creation patterns. The dentry pointer will typically stick around rather aggressively, and will likely remain the same when you delete a file and create another one with the same name, and the mnt pointer will stick around too, so the contents of 'struct path' will be the exact same for two completely different files across a delete/create event. So hashing the path is very likely to stay the same as long as the actual path stays the same, but would be fairly insensitive to the underlying data changing. People might not care, particularly with executables and libraries that simply don't get switched around much. And, 'struct file *' will get reused randomly just based on memory allocation issues, but I wouldn't be surprised if a close/open sequence would get the same 'struct file *' pointer. So these will all have various different 'value stays the same, but the underlying data changed' patterns. I really think that you should just treat the hash as a very random number, not assign it *any* meaning at trace collection time, and the more random the better. 
And then do all the "figure it out" work in user space when *looking* at the traces. It might be a bit more work, and involve a bit more data, but I _think_ it should be very straightforward to just do a "what was the last mmap that had this hash" Linus
On Fri, 29 Aug 2025 09:07:58 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > The dentry pointer will typically stick around rather aggressively, > and will likely remain the same when you delete a file and create > another one with the same name, and the mnt pointer will stick around > too, so the contents of 'struct path' will be the exact same for two > completely different files across a delete/create event. I'm not sure how often a trace would expand the event of running code, deleting the code, recreating it, and running it again. But that means the stack traces of the original code will be useless regardless. But at a minimum, the recreating of the code should trigger another print, and this would give it a different build-id (which I'm not recording as well as the path). > > So hashing the path is very likely to stay the same as long as the > actual path stays the same, but would be fairly insensitive to the > underlying data changing. People might not care, particularly with > executables and libraries that simply don't get switched around much. > > And, 'struct file *' will get reused randomly just based on memory > allocation issues, but I wouldn't be surprised if a close/open > sequence would get the same 'struct file *' pointer. > > So these will all have various different 'value stays the same, but > the underlying data changed' patterns. I really think that you should > just treat the hash as a very random number, not assign it *any* > meaning at trace collection time, and the more random the better. > > And then do all the "figure it out" work in user space when *looking* > at the traces. It might be a bit more work, and involve a bit more > data, but I _think_ it should be very straightforward to just do a > "what was the last mmap that had this hash" I just realized that I'm using the rhashtable as an "does this hash exist". 
I could get the content of the item that matches the hash and compare it to what was used to create the hash in the first place. If there's a reference counter or some other identifier I could use to know that the passed in vma is the same as what is in the hash table, I can use this to know if the hash needs to be updated with the new information or not. -- Steve
On Fri, 29 Aug 2025 at 09:33, Steven Rostedt <rostedt@goodmis.org> wrote: > > I just realized that I'm using the rhashtable as an "does this hash exist". The question is still *why*? Just use the hash. Don't do anything to it. Don't mess with it. > I could get the content of the item that matches the hash and compare it to > what was used to create the hash in the first place. If there's a reference > counter or some other identifier I could use to know that the passed in vma > is the same as what is in the hash table, I can use this to know if the > hash needs to be updated with the new information or not. No such information exists. Sure, we have reference counts for everything: to a very close approximation, any memory allocation with external visibility has to be reference counted for correctness. But those reference counts are never going to tell you whether they are the *same* object that they were last time you looked at it, or just a new allocation that happens to have the same pointer. Don't even *TRY*. You still haven't explained why you would care. Your patch that used inode numbers didn't care. It just used the numbers. SO JUST USE THE NUMBERS, for chissake! Don't make them mean anything. Don't try to think they mean something. The *reason* I think hashing 'struct file *' is better than the alternative is exactly that it *cannot* mean anything. It will get re-used quite actively, even when nobody actually changes any of the files. So you are forced to deal with this correctly, even though you seem to be fighting dealing with it correctly tooth and nail. And at no point have you explained why you can't just treat it as meaningless numbers. The patch that started this all did exactly that. It just used the *wrong* numbers, and I pointed out why they were wrong, and why you shouldn't use those numbers. Linus
On Fri, 29 Aug 2025 09:42:03 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 09:33, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > I just realized that I'm using the rhashtable as an "does this hash exist". > > The question is still *why*? The reason is to keep from triggering the event that records the pathname for every look up. > > SO JUST USE THE NUMBERS, for chissake! Don't make them mean anything. > Don't try to think they mean something. > > The *reason* I htink hashing 'struct file *' is better than the > alternative is exactly that it *cannot* mean anything. It will get > re-used quite actively, even when nobody actually changes any of the > files. So you are forced to deal with this correctly, even though you > seem to be fighting dealing with it correctly tooth and nail. > > And at no point have you explained why you can't just treat it as > meaningless numbers. The patch that started this all did exactly that. > It just used the *wrong* numbers, and I pointed out why they were > wrong, and why you shouldn't use those numbers. I agree. The hash I showed last time was just using the pointers. The hash itself is meaningless and is useless by itself. The only thing the hash is doing is to be an identifier in the stack trace so that the path name and buildid don't need to be generated and saved every time. In my other email, I'm thinking of using the pid / vma->vm_start as a key to know if the pathname needs to be printed again or not. Although, perhaps if a task does a dlopen(), load some text and execute it, then a dlclose() and another dlopen() and loads text, that this could break the assumption that the vm_start is unique per file. Just to clarify, the goal of this exercise is to avoid the work of creating and generating the pathnames and buildids for every lookup / stacktrace. Now maybe hashing the pathname isn't as expensive as I think it may be. And just doing that could be "good enough". -- Steve
On Fri, 29 Aug 2025 at 09:57, Steven Rostedt <rostedt@goodmis.org> wrote: > > The reason is to keep from triggering the event that records the pathname > for every look up. BUT THAT WAS NEVER THE POINT. There is only a single 64-bit number. No lookup. No pointer following. No nothing. The whole point of hashing was to get an *opaque* thing very quickly. Not a pathname. No reference counting. No verifying whether you have seen it before. Literally just something that you can match up in the trace file much much later. (And, honestly, the likely thing is that you never match it up at all - you can delay the "match it up" until a human actually looks at a trace, which is presumably going to be a "one in a million" case). Linus
On Fri, 29 Aug 2025 10:02:40 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > (And, honestly, the likely thing is that you never match it up at all > - you can delay the "match it up" until a human actually looks at a > trace, which is presumably going to be a "one in a million" case). Note the use case for this is for tooling that will be using these traces for either flame graphs or for seeing where trouble areas are. That is, if someone is enabling these stack traces, they most definitely will be looked at. Maybe not directly by a human, but the tooling will and it will need the mapping information. -- Steve
On Fri, 29 Aug 2025 at 09:42, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > Just use the hash. Don't do anything to it. Don't mess with it. In fact, at actual stack trace time, don't even do the hashing. Just save the raw pointer value (but as a *value*, not as a pointer: we absolutely do *not* want people to think that the random value can be used as a 'struct file' *: it needs to be a plain unsigned long, not some kernel pointer). Then the hashing can happen when you expose those entries to user space (in the "print" stage). At that point you can do that hash = siphash_1u64(value, secret); thing. That will likely help I$ and D$ too, since you won't be accessing the secret hashing data randomly, but do it only at trace output time (presumably in a fairly tight loop at that point). Linus
On Fri, 29 Aug 2025 09:50:12 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 09:42, Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > > > Just use the hash. Don't do anything to it. Don't mess with it. > > In fact, at actual stack trace time, don't even do the hashing. Just > save the raw pointer value (but as a *value*, not as a pointer: we > absolutely do *not* want people to think that the random value can be > used as a 'struct file' *: it needs to be a plain unsigned long, not > some kernel pointer). > > Then the hashing can happen when you expose those entries to user > space (in the "print" stage). At that point you can do that > > hash = siphash_1u64(value, secret); > > thing. > > That will likely help I$ and D$ too, since you won't be accessing the > secret hashing data randomly, but do it only at trace output time > (presumably in a fairly tight loop at that point). Note, the ring buffer can be mapped to user space. So anything written into the buffer is already exposed. The "at trace output time" is done by user space, not the kernel (except when using "trace" and "trace_pipe" files). -- Steve
On Fri, 29 Aug 2025 at 10:02, Steven Rostedt <rostedt@goodmis.org> wrote: > > Note, the ring buffer can be mapped to user space. So anything written into > the buffer is already exposed. Oh, good point. Yeah, that means that you have to do the hashing immediately. Too bad. Because while 'vma->vm_file' is basically free (since you have to access the vma for other reasons anyway), a good hash isn't. siphash is good and fast for being what it is, but it's not completely free. It's something like 50 shift/xor pairs, and it obviously needs to also access that secret hash value that is likely behind a cache miss.. Still, I suspect it's the best we've got. (If hashing is noticeable, it *might* be worth it to use 'siphash_1u32()' and only hash 32 bits of the pointers. That makes the hashing slightly cheaper, and since the low bits of the pointer will be zero anyway due to alignment, and the high bits don't have a lot of information in them either, it doesn't actually remove much information. You might get collisions if the two pointers are exactly 32 GB apart or whatever, but that sounds really really unlucky) Linus
On August 29, 2025 2:13:33 PM GMT-03:00, Linus Torvalds <torvalds@linux-foundation.org> wrote: >On Fri, 29 Aug 2025 at 10:02, Steven Rostedt <rostedt@goodmis.org> wrote: >> >> Note, the ring buffer can be mapped to user space. So anything written into >> the buffer is already exposed. > >Oh, good point. Yeah, that means that you have to do the hashing >immediately. Too bad. Because while 'vma->vm_file' is basically free >(since you have to access the vma for other reasons anyway), a good >hash isn't. Can't we continue with that idea by using some VMA sequential number that doesn't expose anything critical to user space but allows us to match stack entries to mmap records? Deferring the heavy lifting to when needed is great. - Arnaldo > >siphash is good and fast for being what it is, but it's not completely >free. It's something like 50 shift/xor pairs, and it obviously needs >to also access that secret hash value that is likely behind a cache >miss.. > >Still, I suspect it's the best we've got. > >(If hashing is noticeable, it *might* be worth it to use >'siphash_1u32()' and only hash 32 bits of the pointers. That makes the >hashing slightly cheaper, and since the low bits of the pointer will >be zero anyway due to alignment, and the high bits don't have a lot of >information in them either, it doesn't actually remove much >information. You might get collissions if the two pointers are exactly >32 GB apart or whatever, but that sounds really really unlucky) > > Linus - Arnaldo
On Fri, 29 Aug 2025 at 10:58, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > > Can't we continue with that idea by using some VMA sequential number that don't expose anything critical to user space but allows us to match stack entries to mmap records? No such record exists. I guess we could add an atomic ID to do_mmap() and have a 64-bit value that would be unique and would follow vma splitting and movement around. But it would actually be worse than just using the 'struct file' pointer, and the only advantage would be avoiding the hashing. And the disadvantages would be many. In particular, it would be much better if it was per-file, but we are *definitely* not adding some globally unique value to each file, because we already have seen performance issues with open/close loads just from the atomic sequence counters we used to give to anonymous inodes. (I say "used to give" - see get_next_ino() for what we do now, with per-cpu counter sequences. We'd have to do similar tricks for some kind of 'file ID', and I really don't want to do that with no reason). And if the only reason is "hashing takes a hundred cycles" when generating traces, that's just not a reason good enough to bloat core kernel data structures like 'struct file' and make core ops like open() more expensive. Linus
[ My last email of the night, as it's our anniversary, and I'm off to dinner now ;-) ] On Thu, 28 Aug 2025 15:10:52 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Thu, 28 Aug 2025 at 14:17, Steven Rostedt <rostedt@kernel.org> wrote: > > > > But that's unique per task, right? What I liked about the f_inode > > pointer, is that it appears to be shared between tasks. > > I actually think the local meaning of the file pointer is an advantage. > > It not only means that you see the difference in mappings of the same > file created with different open calls, it also means that when > different processes mmap the same executable, they don't see the same > hash. > > And because the file pointer doesn't have any long-term meaning, it > also means that you also can't make the mistake of thinking the hash > has a long lifetime. With an inode pointer hash, you could easily have > software bugs that end up not realizing that it's a temporary hash, > and that the same inode *will* get two different hashes if the inode > has been flushed from memory and then loaded anew due to memory > pressure. This is a reasonable argument. But it is still nice to have the same value for all tasks. This is for a "file_cache" that does get flushed regularly (when various changes happen to the tracefs system). It's only purpose is to map the user space stack trace hash value to a path name (and build-id). But yeah, I do not want another file to get flagged with the same hash. > > > I only want to add a new hash and print the path for a new file. If > > several tasks are using the same file (which they are with the > > libraries), then having the hash be the same between tasks would be > > more efficient. > > Why? See above why I think it's a mistake to think those hashes have > lifetimes. They don't. Two different inodes can have the same hash due > to lifetime issues, and the same inode can get two different hashes at > different times for the same reason. 
> > So you *need* to tie these things to the only lifetime that matters: > the open/close pair (and the mmap - and the stack traces - will be > part of that lifetime). > > I literally think that you are not thinking about this right if you > think you can re-use the hash. I'm just worried about this causing slow downs, especially if I also track the buildid. I did a quick update to the code to first use the f_inode and get the build_id, and it gives: trace-cmd-1012 [003] ...1. 35.247318: inode_cache: hash=0xcb214087 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1012 [003] ...1. 35.247333: inode_cache: hash=0x2565194a path=/usr/local/bin/trace-cmd build_id={0x3f399e26,0xf9eb2d4d,0x475fa369,0xf5bb7eeb,0x6244ae85} trace-cmd-1012 [003] ...1. 35.247419: inode_cache: hash=0x22dca920 path=/usr/local/lib64/libtracefs.so.1.8.2 build_id={0x6b040bdb,0x961f23d6,0xc1e1027e,0x7067c348,0xd069fa67} trace-cmd-1012 [003] ...1. 35.247455: inode_cache: hash=0xe87b6ea5 path=/usr/local/lib64/libtraceevent.so.1.8.4 build_id={0x8946b4eb,0xe3bf4ec5,0x11fd7d86,0xcd3105e2,0xe44a8d4d} trace-cmd-1012 [003] ...1. 35.247488: inode_cache: hash=0xafc34117 path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 build_id={0x379dc873,0x32bbdbc4,0x91eeb6cf,0xba549730,0xe2b96c55} bash-1003 [001] ...1. 35.248508: inode_cache: hash=0xcf9bd2d6 path=/usr/bin/bash build_id={0xd94aa36d,0x8e1f19c7,0xa4a69446,0x7338f602,0x20d66357} NetworkManager-581 [004] ...1. 35.703993: inode_cache: hash=0xea1c3e22 path=/usr/sbin/NetworkManager build_id={0x278c6dbb,0x4a1cdde6,0xa1a30a2c,0xbc417464,0x9dfaa28e} bash-1003 [001] ...1. 35.904817: inode_cache: hash=0x133252fa path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 build_id={0xff2193a5,0xb2ece2f1,0x1bcbd242,0xca302a0b,0xc155fd26} bash-1013 [004] ...1. 
37.716435: inode_cache: hash=0x53ae379b path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 build_id={0x4ed9e462,0xb302cd84,0x3ccf0104,0xbd80ac72,0x91c7fd44} bash-1013 [004] ...1. 37.722923: inode_cache: hash=0xa55a259e path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 build_id={0xc2d9e5b6,0xb211e958,0xdef878e4,0xe4022df,0x9552253} Now I changed it to be the file pointer, and it does give a bit more (see the duplicates): sshd-session-1004 [007] ...1. 98.940058: inode_cache: hash=0x41a6191a path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1016 [006] ...1. 98.940089: inode_cache: hash=0xcc38a542 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1016 [006] ...1. 98.940109: inode_cache: hash=0xa89cdd4b path=/usr/local/bin/trace-cmd build_id={0x3f399e26,0xf9eb2d4d,0x475fa369,0xf5bb7eeb,0x6244ae85} trace-cmd-1016 [006] ...1. 98.940410: inode_cache: hash=0xb3c570ca path=/usr/local/lib64/libtracefs.so.1.8.2 build_id={0x6b040bdb,0x961f23d6,0xc1e1027e,0x7067c348,0xd069fa67} trace-cmd-1016 [006] ...1. 98.940460: inode_cache: hash=0x4da4af85 path=/usr/local/lib64/libtraceevent.so.1.8.4 build_id={0x8946b4eb,0xe3bf4ec5,0x11fd7d86,0xcd3105e2,0xe44a8d4d} trace-cmd-1016 [006] ...1. 98.940513: inode_cache: hash=0xce16bd9d path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 build_id={0x379dc873,0x32bbdbc4,0x91eeb6cf,0xba549730,0xe2b96c55} bash-1007 [004] ...1. 98.941772: inode_cache: hash=0x772df671 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} bash-1007 [004] ...1. 98.941911: inode_cache: hash=0xdb764962 path=/usr/bin/bash build_id={0xd94aa36d,0x8e1f19c7,0xa4a69446,0x7338f602,0x20d66357} bash-1007 [004] ...1. 100.080299: inode_cache: hash=0xef3bf212 path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 build_id={0xff2193a5,0xb2ece2f1,0x1bcbd242,0xca302a0b,0xc155fd26} gmain-602 [003] ...1. 
100.477235: inode_cache: hash=0xc9205658 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1017 [005] ...1. 101.412116: inode_cache: hash=0x5a77751e path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 build_id={0x4ed9e462,0xb302cd84,0x3ccf0104,0xbd80ac72,0x91c7fd44} trace-cmd-1017 [005] ...1. 101.417004: inode_cache: hash=0xf2e95689 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1017 [005] ...1. 101.418528: inode_cache: hash=0x5f35d3ca path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 build_id={0x379dc873,0x32bbdbc4,0x91eeb6cf,0xba549730,0xe2b96c55} trace-cmd-1017 [005] ...1. 101.418572: inode_cache: hash=0x57feda78 path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 build_id={0xc2d9e5b6,0xb211e958,0xdef878e4,0xe4022df,0x9552253} trace-cmd-1017 [005] ...1. 101.418620: inode_cache: hash=0x22ad5d84 path=/usr/local/lib64/libtraceevent.so.1.8.4 build_id={0x8946b4eb,0xe3bf4ec5,0x11fd7d86,0xcd3105e2,0xe44a8d4d} trace-cmd-1017 [005] ...1. 101.418666: inode_cache: hash=0x11c240a6 path=/usr/local/lib64/libtracefs.so.1.8.2 build_id={0x6b040bdb,0x961f23d6,0xc1e1027e,0x7067c348,0xd069fa67} trace-cmd-1017 [005] ...1. 101.418714: inode_cache: hash=0xf4e46cf path=/usr/local/bin/trace-cmd build_id={0x3f399e26,0xf9eb2d4d,0x475fa369,0xf5bb7eeb,0x6244ae85} wpa_supplicant-583 [000] ...1. 102.521195: inode_cache: hash=0xd20a587b path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1018 [005] ...1. 102.847910: inode_cache: hash=0xee16ee8e path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} sshd-session-1004 [000] ...1. 102.853561: inode_cache: hash=0x3404c7ea path=/usr/lib/openssh/sshd-session build_id={0x3b119855,0x5b15323e,0xe1ec337a,0xbd49f66e,0x78bddd0f} systemd-udevd-323 [007] ...1. 
125.800839: inode_cache: hash=0x760273d5 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} systemd-journal-294 [000] ...1. 125.800932: inode_cache: hash=0x77f34056 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} systemd-udevd-323 [007] ...1. 125.801135: inode_cache: hash=0xe70bd063 path=/usr/lib/x86_64-linux-gnu/systemd/libsystemd-shared-257.so build_id={0x81d9bace,0x59f9953f,0x439928d7,0xe849d513,0xf2103286} systemd-1 [006] ...1. 125.801781: inode_cache: hash=0x42292844 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} systemd-1 [006] ...1. 125.802811: inode_cache: hash=0x2cac8b3b path=/usr/lib/x86_64-linux-gnu/systemd/libsystemd-core-257.so build_id={0x580a80c5,0x931714d2,0xec54d3be,0xd5400bc0,0x6f2530ba} systemd-1 [006] ...1. 125.803740: inode_cache: hash=0xb17acaa6 path=/usr/lib/x86_64-linux-gnu/systemd/libsystemd-shared-257.so build_id={0x81d9bace,0x59f9953f,0x439928d7,0xe849d513,0xf2103286} cron-541 [006] ...1. 138.192640: inode_cache: hash=0x9285db61 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} NetworkManager-581 [005] ...1. 144.716224: inode_cache: hash=0xf3c5bbc1 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} NetworkManager-581 [005] ...1. 144.716392: inode_cache: hash=0x381883bb path=/usr/sbin/NetworkManager build_id={0x278c6dbb,0x4a1cdde6,0xa1a30a2c,0xbc417464,0x9dfaa28e} NetworkManager-581 [005] ...1. 146.385151: inode_cache: hash=0x43451e15 path=/usr/lib/x86_64-linux-gnu/libglib-2.0.so.0.8400.0 build_id={0x9a7d3e29,0x5d8ed8f,0xe399da0,0xb5d373da,0x3ca1049b} chronyd-663 [001] ...1. 157.080405: inode_cache: hash=0xa0db647a path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} chronyd-663 [001] ...1. 
158.152790: inode_cache: hash=0x1c471c4c path=/usr/sbin/chronyd build_id={0xf9588e62,0x3a8e6223,0x619fcb4f,0x12562bb,0x2ea104fb} But maybe it's not enough to be an issue. But this will become more predominant when sframes is built throughout. I only have a few applications having sframes enabled so not every task is getting a full stack trace, and hence, not all the files being touched are being displayed. Just to clarify my concern. I want the stack traces to be quick and small. I believe a 32 bit hash may be enough. And then have a side event that gets updated when new files appear that can display much more information. This side event may be slow which is why I don't want it to occur often. But I do want it to occur for all new files. -- Steve
On August 28, 2025 5:17:18 PM GMT-03:00, Steven Rostedt <rostedt@kernel.org> wrote: >On Thu, 28 Aug 2025 12:18:39 -0700 >Linus Torvalds <torvalds@linux-foundation.org> wrote: > >> On Thu, 28 Aug 2025 at 11:58, Arnaldo Carvalho de Melo >> <arnaldo.melo@gmail.com> wrote: >> > > >> > >Give the damn thing an actual filename or something *useful*, not a >> > >number that user space can't even necessarily match up to anything. >> > >> > A build ID? >> >> I think that's a better thing than the disgusting inode number, yes. > >I don't care what it is. I picked inode/device just because it was the >only thing I saw available. I'm not sure build ID is appropriate either. > >> >> That said, I think they are problematic too, in that I don't think >> they are universally available, so if you want to trace some >> executable without build ids - and there are good reasons to do that - >> you might hate being limited that way. >> >> So I think you'd be much better off with just actual pathnames. > >As you mentioned below, the reason I avoided path names is that they >take up too much of the ring buffer, and would be duplicated all over >the place. I've run this for a while, and it only picked up a couple of >hundred paths while the trace had several thousand stack traces. > >> >> Are there no trace events for "mmap this path"? Create a good u64 hash >> from the contents of a 'struct path' (which is just two pointers: the >> dentry and the mnt) when mmap'ing the file, and then you can just >> associate the stack trace entry with that hash. > >I would love to have a hash to use. The next patch does the mapping of >the inode numbers to their path name. It can The path name is a nice to have detail, but a content based hash is what we want, no? Tracing/profiling has to be about contents of files later used for analysis, and filenames provide no guarantee about that. - Arnaldo easily be switched over to >do the same with a hash number. 
> >> >> That should be simple and straightforward, and hashing two pointers >> should be simple and straightforward. > >Would a hash of these pointers have any collisions? That would be bad. > >Hmm, I just tried using the pointer to vma->vm_file->f_inode, and that >gives me a unique number. Then I just need to map that back to the path name: > > trace-cmd-1016 [002] ...1. 34.675646: inode_cache: inode=ffff8881007ed428 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libc.so.6 > trace-cmd-1016 [002] ...1. 34.675893: inode_cache: inode=ffff88811970e648 dev=[254:3] path=/usr/local/lib64/libtracefs.so.1.8.2 > trace-cmd-1016 [002] ...1. 34.675933: inode_cache: inode=ffff88811970b8f8 dev=[254:3] path=/usr/local/lib64/libtraceevent.so.1.8.4 > trace-cmd-1016 [002] ...1. 34.675981: inode_cache: inode=ffff888110b78ba8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 > bash-1007 [003] ...1. 34.677316: inode_cache: inode=ffff888103f05d38 dev=[254:3] path=/usr/bin/bash > bash-1007 [003] ...1. 35.432951: inode_cache: inode=ffff888116be94b8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 > bash-1018 [005] ...1. 36.104543: inode_cache: inode=ffff8881007e9dc8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 > bash-1018 [005] ...1. 36.110407: inode_cache: inode=ffff888110b78298 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 > bash-1018 [005] ...1. 36.110536: inode_cache: inode=ffff888103d09dc8 dev=[254:3] path=/usr/local/bin/trace-cmd > >I just swapped out the inode with the above (unsigned long)vma->vm_file->f_inode, >and it appears to be unique. > >Thus, I could use that as the "hash" value and then the above could be turned into: > > trace-cmd-1016 [002] ...1. 34.675646: inode_cache: hash=ffff8881007ed428 path=/usr/lib/x86_64-linux-gnu/libc.so.6 > trace-cmd-1016 [002] ...1. 34.675893: inode_cache: hash=ffff88811970e648 path=/usr/local/lib64/libtracefs.so.1.8.2 > trace-cmd-1016 [002] ...1. 
34.675933: inode_cache: hash=ffff88811970b8f8 path=/usr/local/lib64/libtraceevent.so.1.8.4 > trace-cmd-1016 [002] ...1. 34.675981: inode_cache: hash=ffff888110b78ba8 path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 > bash-1007 [003] ...1. 34.677316: inode_cache: hash=ffff888103f05d38 path=/usr/bin/bash > bash-1007 [003] ...1. 35.432951: inode_cache: hash=ffff888116be94b8 path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 > bash-1018 [005] ...1. 36.104543: inode_cache: hash=ffff8881007e9dc8 path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 > bash-1018 [005] ...1. 36.110407: inode_cache: hash=ffff888110b78298 path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 > bash-1018 [005] ...1. 36.110536: inode_cache: hash=ffff888103d09dc8 path=/usr/local/bin/trace-cmd > >This would mean the readers of the userstacktrace_delay need to also >have this event enabled to do the mappings. But that shouldn't be an >issue. > >-- Steve > - Arnaldo
On Thu, 28 Aug 2025 17:27:37 -0300 Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > >I would love to have a hash to use. The next patch does the mapping > >of the inode numbers to their path name. It can > > The path name is a nice to have detail, but a content based hash is > what we want, no? > > Tracing/profiling has to be about contents of files later used for > analysis, and filenames provide no guarantee about that. I could add the build id to the inode_cache as well (which I'll rename to file_cache). Thus, the user stack trace will just have the offset and a hash value that will match the output of the file_cache event which will have the path name and a build id (if one exists). Would that work? -- Steve
On August 28, 2025 5:51:39 PM GMT-03:00, Steven Rostedt <rostedt@kernel.org> wrote: >On Thu, 28 Aug 2025 17:27:37 -0300 >Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > >> >I would love to have a hash to use. The next patch does the mapping >> >of the inode numbers to their path name. It can >> >> The path name is a nice to have detail, but a content based hash is >> what we want, no? >> >> Tracing/profiling has to be about contents of files later used for >> analysis, and filenames provide no guarantee about that. >I could add the build id to the inode_cache as well (which I'll rename >to file_cache). > >Thus, the user stack trace will just have the offset and a hash value >that will be match the output of the file_cache event which will have >the path name and a build id (if one exists). > >Would that work? Probably. This "if it is available" question is valid, but since 2016 it's more of a "did developers disable it explicitly?" If my "googling" isn't wrong, GNU LD defaults to generating a build ID in ELF images since 2011 and clang's companion since 2016. So making it even more available than what the BPF guys did long ago and perf piggybacked on at some point, by having it cached, on request?, in some 20 bytes alignment hole in task_struct that would be only used when profiling/tracing may be amenable. - Arnaldo
Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> writes: > On August 28, 2025 5:51:39 PM GMT-03:00, Steven Rostedt <rostedt@kernel.org> wrote: >>On Thu, 28 Aug 2025 17:27:37 -0300 >>Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: >> >>> >I would love to have a hash to use. The next patch does the mapping >>> >of the inode numbers to their path name. It can >>> >>> The path name is a nice to have detail, but a content based hash is >>> what we want, no? >>> >>> Tracing/profiling has to be about contents of files later used for >>> analysis, and filenames provide no guarantee about that. >> >>I could add the build id to the inode_cache as well (which I'll rename >>to file_cache). >> >>Thus, the user stack trace will just have the offset and a hash value >>that will be match the output of the file_cache event which will have >>the path name and a build id (if one exists). >> >>Would that work? > > Probably. > > This "if it is available" question is valid, but since 2016 it's is more of a "did developers disabled it explicitly?" > > If my "googling" isn't wrong, GNU LD defaults to generating a build ID in ELF images since 2011 and clang's companion since 2016. GNU ld doesn't ever default to generating build IDs, and I don't *think* LLVM does either (either in Clang, or LLD). GCC, on the other hand, has a configure arg to control this, but it's default-off. Clang generally prefers to have defaults like this done via user/sysadmin specified configuration files rather than adding build-time configure flags. Now, is it a reasonable ask to say "we require build IDs for this feature"? Yeah, it probably is, but it's not default-on right now, and indeed we in Gentoo aren't using them yet (but I'm working on enabling them). 
> > So making it even more available than what the BPF guys did long ago > and perf piggybacked on at some point, by having it cached, on > request?, in some 20 bytes alignment hole in task_struct that would be > only used when profiling/tracing may be amenable. thanks, sam
On Thu, 28 Aug 2025 18:00:22 -0300 Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > >Thus, the user stack trace will just have the offset and a hash value > >that will be match the output of the file_cache event which will have > >the path name and a build id (if one exists). > > > >Would that work? > > Probably. > > This "if it is available" question is valid, but since 2016 it's is > more of a "did developers disabled it explicitly?" The "if one exists" comment is that it's not a requirement. If none exists, it would just add a zero. > > If my "googling" isn't wrong, GNU LD defaults to generating a build > ID in ELF images since 2011 and clang's companion since 2016. > > So making it even more available than what the BPF guys did long ago > and perf piggybacked on at some point, by having it cached, on > request?, in some 20 bytes alignment hole in task_struct that would > be only used when profiling/tracing may be amenable. Would perf be interested in this hash file lookup? I know perf is reliant on user space more than ftrace is, and has a lot of work happening in user space while getting stack traces. With ftrace, there's no real user space requirement, thus a lot of the work needs to be done in the kernel. If we go with a hash to file, it's somewhat useless by itself without a way to map the hash to file/buildid. I originally started making this hash->file a file in tracefs. But then I needed to figure out how to manage the allocations. Do I add a "size" for that file and start dropping mappings when it reaches that limit. Then I may need to add a LRU algorithm to do so. I found simply having an event that wrote out the mappings was so much easier to implement. But the file_cache code could be used by perf, where perf does the same and just monitors the file_cache event. I could make the API more global than just the kernel/trace directory. -- Steve
On Thu, 28 Aug 2025 at 13:27, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > > The path name is a nice to have detail, but a content based hash is what we want, no? No. We want something easy and quick to compute because this is looked up at stacktrace time. That's the primary issue. You can do the mapping to some build-id - if it even exists - later. Linus
On Thu, Aug 28, 2025 at 12:18:39PM -0700, Linus Torvalds wrote: > On Thu, 28 Aug 2025 at 11:58, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > > >Give the damn thing an actual filename or something *useful*, not a > > >number that user space can't even necessarily match up to anything. > > A build ID? > I think that's a better thing than the disgusting inode number, yes. > That said, I think they are problematic too, in that I don't think > they are universally available, so if you want to trace some > executable without build ids - and there are good reasons to do that - > you might hate being limited that way. Right, but these days gdb (and other traditional tools) supports it and downloads it (perf should do it with a one-time sticky question too, does it already in some cases, unconditionally, that should be fixed as well), most distros have it: ⬢ [acme@toolbx perf-tools-next]$ file /bin/bash /bin/bash: ELF 64-bit LSB pie executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=707a1c670cd72f8e55ffedfbe94ea98901b7ce3a, for GNU/Linux 3.2.0, stripped ⬢ [acme@toolbx perf-tools-next]$ We have debuginfod-servers that brings ELF images with debug keyed by that build id and finally build-ids come together with pathnames, so if one is null, fallback to the other. Default in fedora: ⬢ [acme@toolbx perf-tools-next]$ echo $DEBUGINFOD_ $DEBUGINFOD_IMA_CERT_PATH $DEBUGINFOD_URLS ⬢ [acme@toolbx perf-tools-next]$ echo $DEBUGINFOD_ $DEBUGINFOD_IMA_CERT_PATH $DEBUGINFOD_URLS ⬢ [acme@toolbx perf-tools-next]$ echo $DEBUGINFOD_IMA_CERT_PATH /etc/keys/ima: ⬢ [acme@toolbx perf-tools-next]$ echo $DEBUGINFOD_URLS https://debuginfod.fedoraproject.org/ ⬢ [acme@toolbx perf-tools-next]$ I wasn't aware of that IMA stuff. 
So even without the mandate and with sometimes not being able to get that build-id, most of the time they are there and deterministically allows tooling to fetch it in most cases, I guess that is as far as we can pragmatically get. - Arnaldo > So I think you'd be much better off with just actual pathnames. > > Are there no trace events for "mmap this path"? Create a good u64 hash > from the contents of a 'struct path' (which is just two pointers: the > dentry and the mnt) when mmap'ing the file, and then you can just > associate the stack trace entry with that hash. > > That should be simple and straightforward, and hashing two pointers > should be simple and straightforward. > > And then matching that hash against the mmap event where the actual > path was saved off gives you an actual *pathname*. Which is *so* much > better than those horrific inode numbers. > > And yes, yes, obviously filenames can go away and aren't some kind of > long-term stable thing. But inode numbers can be re-used too, so > that's no different. > > With the "create a hash of 'struct path' contents" you basically have > an ID that can be associated with whatever the file name was at the > time it was mmap'ed into the thing you are tracing, which is I think > what you really want anyway. > > Now, what would be even simpler is to not create a hash at all, but > simply just create the whole pathname when the stack trace entry is > created. But it would probably waste too much space, since you'd > probably want to have at least 32 bytes (as opposed to just 64 bits) > for a (truncated) pathname. > > And it would be more expensive than just hashing the dentry/mnt > pointers, although '%pD' isn't actually *that* expensive. But probably > expensive enough to not really be acceptable. I'm just throwing it out > as a stupid idea that at least generates much more usable output than > the inode numbers do. > > Linus
On Thu, 28 Aug 2025 at 13:04, Arnaldo Carvalho de Melo <acme@kernel.org> wrote: > > > That said, I think they are problematic too, in that I don't think > > they are universally available, so if you want to trace some > > executable without build ids - and there are good reasons to do that - > > you might hate being limited that way. > > Right, but these days gdb (and other traditional tools) supports it and > downloads it (perf should do it with a one-time sticky question too, > does it already in some cases, unconditionally, that should be fixed as > well), most distros have it: So I'm literally thinking one very valid case is debugging some third-party binary with tracing. End result: the whole "most distros have it" is just not relevant to that situation. Linus
On 2025-08-28 14:58, Arnaldo Carvalho de Melo wrote: > > > On August 28, 2025 3:39:35 PM GMT-03:00, Linus Torvalds <torvalds@linux-foundation.org> wrote: >> On Thu, 28 Aug 2025 at 11:05, Steven Rostedt <rostedt@kernel.org> wrote: >>> >>> The deferred user space stacktrace event already does a lookup of the vma >>> for each address in the trace to get the file offset for those addresses, >>> it can also report the file itself. >> >> That sounds like a good idea.. >> >> But the implementation absolutely sucks: >> >>> Add two more arrays to the user space stacktrace event. One for the inode >>> number, and the other to store the device major:minor number. Now the >>> output looks like this: >> >> WTF? Why are you back in the 1960's? What's next? The index into the >> paper card deck? >> >> Stop using inode numbers and device numbers already. It's the 21st >> century. No, cars still don't fly, but dammit, inode numbers were a >> great idea back in the days, but they are not acceptable any more. >> >> They *particularly* aren't acceptable when you apparently think that >> they are 'unsigned long'. Yes, that's the internal representation we >> use for inode indexing, but for example on nfs the inode is actually >> bigger. It's exposed to user space as a u64 through >> >> stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); >> >> so the inode that user space sees in 'struct stat' (a) doesn't >> actually match inode->i_ino, and (b) isn't even the full file ID that >> NFS actually uses. >> >> Let's not let that 60's thinking be any part of a new interface. >> >> Give the damn thing an actual filename or something *useful*, not a >> number that user space can't even necessarily match up to anything. >> > > A build ID? 
> > PERF_RECORD_MMAP went thru this, filename -> inode -> Content based hash FWIW, we record: - executable or shared library path name, - build id (if available), - debug link (if available), in LTTng-UST when we dump the loaded executable and libraries from userspace. Thanks, Mathieu -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
© 2016 - 2025 Red Hat, Inc.