From: Steven Rostedt <rostedt@goodmis.org>
The deferred user space stacktrace event already does a lookup of the vma
for each address in the trace to get the file offset for those addresses,
it can also report the file itself.
Add two more arrays to the user space stacktrace event. One for the inode
number, and the other to store the device major:minor number. Now the
output looks like this:
trace-cmd-1108 [007] ..... 240.253487: <user stack unwind>
cookie=7000000000009
=> <00000000001001ca> : 1340007 : 254:3
=> <000000000000ae27> : 1308548 : 254:3
=> <00000000000107e7> : 1440347 : 254:3
=> <00000000000109d3> : 1440347 : 254:3
=> <0000000000011523> : 1440347 : 254:3
=> <000000000001f79d> : 1440347 : 254:3
=> <000000000001fb01> : 1440347 : 254:3
=> <000000000001fbc0> : 1440347 : 254:3
=> <000000000000eb7e> : 1440347 : 254:3
=> <0000000000029ca8> : 1340007 : 254:3
User space tooling can use this information to get the actual functions
from the files.
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v5: https://lore.kernel.org/20250424192613.869730948@goodmis.org
- Set inode to -1L if vma is not found for that address to let user space
know that, and differentiate from a vdso section.
kernel/trace/trace.c | 26 +++++++++++++++++++++++++-
kernel/trace/trace_entries.h | 8 ++++++--
kernel/trace/trace_output.c | 27 +++++++++++++++++++++++++++
3 files changed, 58 insertions(+), 3 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3e9ef644dd64..c6e1471e4615 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3140,6 +3140,8 @@ static void trace_user_unwind_callback(struct unwind_work *unwind,
unsigned int trace_ctx;
struct vm_area_struct *vma = NULL;
unsigned long *caller;
+ unsigned long *inodes;
+ unsigned int *devs;
unsigned int offset;
int len;
int i;
@@ -3151,7 +3153,8 @@ static void trace_user_unwind_callback(struct unwind_work *unwind,
if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE_DELAY))
return;
- len = trace->nr * sizeof(unsigned long) + sizeof(*entry);
+ len = trace->nr * (sizeof(unsigned long) * 2 + sizeof(unsigned int))
+ + sizeof(*entry);
trace_ctx = tracing_gen_ctx();
@@ -3172,6 +3175,15 @@ static void trace_user_unwind_callback(struct unwind_work *unwind,
entry->__data_loc_stack = offset | (len << 16);
caller = (void *)entry + offset;
+ offset += len;
+ entry->__data_loc_inodes = offset | (len << 16);
+ inodes = (void *)entry + offset;
+
+ offset += len;
+ len = sizeof(unsigned int) * trace->nr;
+ entry->__data_loc_dev = offset | (len << 16);
+ devs = (void *)entry + offset;
+
for (i = 0; i < trace->nr; i++) {
unsigned long addr = trace->entries[i];
@@ -3180,9 +3192,21 @@ static void trace_user_unwind_callback(struct unwind_work *unwind,
if (!vma) {
caller[i] = addr;
+ /* Use -1 to denote no vma found */
+ inodes[i] = -1L;
+ devs[i] = 0;
continue;
}
+
caller[i] = (addr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
+
+ if (vma->vm_file && vma->vm_file->f_inode) {
+ inodes[i] = vma->vm_file->f_inode->i_ino;
+ devs[i] = vma->vm_file->f_inode->i_sb->s_dev;
+ } else {
+ inodes[i] = 0;
+ devs[i] = 0;
+ }
}
__buffer_unlock_commit(buffer, event);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 40dc53ead0a8..5f7b72359901 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -256,10 +256,14 @@ FTRACE_ENTRY(user_unwind_stack, userunwind_stack_entry,
F_STRUCT(
__field( u64, cookie )
__dynamic_array( unsigned long, stack )
+ __dynamic_array( unsigned long, inodes )
+ __dynamic_array( unsigned int, dev )
),
- F_printk("cookie=%lld\n%s", __entry->cookie,
- __print_dynamic_array(stack, sizeof(unsigned long)))
+ F_printk("cookie=%lld\n%s%s%s", __entry->cookie,
+ __print_dynamic_array(stack, sizeof(unsigned long)),
+ __print_dynamic_array(inodes, sizeof(unsigned long)),
+ __print_dynamic_array(dev, sizeof(unsigned long)))
);
FTRACE_ENTRY(user_unwind_cookie, userunwind_cookie_entry,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 9489537533f7..437e5f23b73d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1410,9 +1410,13 @@ static enum print_line_t trace_user_unwind_stack_print(struct trace_iterator *it
struct userunwind_stack_entry *field;
struct trace_seq *s = &iter->seq;
unsigned long *caller;
+ unsigned long *inodes;
+ unsigned int *devs;
unsigned int offset;
unsigned int len;
unsigned int caller_cnt;
+ unsigned int inode_cnt;
+ unsigned int dev_cnt;
unsigned int i;
trace_assign_type(field, iter->ent);
@@ -1429,6 +1433,21 @@ static enum print_line_t trace_user_unwind_stack_print(struct trace_iterator *it
caller = (void *)iter->ent + offset;
+ /* The inodes and devices are also dynamic pointers */
+ offset = field->__data_loc_inodes;
+ len = offset >> 16;
+ offset = offset & 0xffff;
+ inode_cnt = len / sizeof(*inodes);
+
+ inodes = (void *)iter->ent + offset;
+
+ offset = field->__data_loc_dev;
+ len = offset >> 16;
+ offset = offset & 0xffff;
+ dev_cnt = len / sizeof(*devs);
+
+ devs = (void *)iter->ent + offset;
+
for (i = 0; i < caller_cnt; i++) {
unsigned long ip = caller[i];
@@ -1437,6 +1456,14 @@ static enum print_line_t trace_user_unwind_stack_print(struct trace_iterator *it
trace_seq_puts(s, " => ");
seq_print_user_ip(s, NULL, ip, flags);
+
+ if (i < inode_cnt) {
+ trace_seq_printf(s, " : %ld", inodes[i]);
+ if (i < dev_cnt) {
+ trace_seq_printf(s, " : %d:%d",
+ MAJOR(devs[i]), MINOR(devs[i]));
+ }
+ }
trace_seq_putc(s, '\n');
}
--
2.50.1
On Thu, 28 Aug 2025 at 11:05, Steven Rostedt <rostedt@kernel.org> wrote: > > The deferred user space stacktrace event already does a lookup of the vma > for each address in the trace to get the file offset for those addresses, > it can also report the file itself. That sounds like a good idea.. But the implementation absolutely sucks: > Add two more arrays to the user space stacktrace event. One for the inode > number, and the other to store the device major:minor number. Now the > output looks like this: WTF? Why are you back in the 1960's? What's next? The index into the paper card deck? Stop using inode numbers and device numbers already. It's the 21st century. No, cars still don't fly, but dammit, inode numbers were a great idea back in the days, but they are not acceptable any more. They *particularly* aren't acceptable when you apparently think that they are 'unsigned long'. Yes, that's the internal representation we use for inode indexing, but for example on nfs the inode is actually bigger. It's exposed to user space as a u64 through stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); so the inode that user space sees in 'struct stat' (a) doesn't actually match inode->i_ino, and (b) isn't even the full file ID that NFS actually uses. Let's not let that 60's thinking be any part of a new interface. Give the damn thing an actual filename or something *useful*, not a number that user space can't even necessarily match up to anything. Linus
On August 28, 2025 3:39:35 PM GMT-03:00, Linus Torvalds <torvalds@linux-foundation.org> wrote: >On Thu, 28 Aug 2025 at 11:05, Steven Rostedt <rostedt@kernel.org> wrote: >> >> The deferred user space stacktrace event already does a lookup of the vma >> for each address in the trace to get the file offset for those addresses, >> it can also report the file itself. > >That sounds like a good idea.. > >But the implementation absolutely sucks: > >> Add two more arrays to the user space stacktrace event. One for the inode >> number, and the other to store the device major:minor number. Now the >> output looks like this: > >WTF? Why are you back in the 1960's? What's next? The index into the >paper card deck? > >Stop using inode numbers and device numbers already. It's the 21st >century. No, cars still don't fly, but dammit, inode numbers were a >great idea back in the days, but they are not acceptable any more. > >They *particularly* aren't acceptable when you apparently think that >they are 'unsigned long'. Yes, that's the internal representation we >use for inode indexing, but for example on nfs the inode is actually >bigger. It's exposed to user space as a u64 through > > stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); > >so the inode that user space sees in 'struct stat' (a) doesn't >actually match inode->i_ino, and (b) isn't even the full file ID that >NFS actually uses. > >Let's not let that 60's thinking be any part of a new interface. > >Give the damn thing an actual filename or something *useful*, not a >number that user space can't even necessarily match up to anything. > A build ID? PERF_RECORD_MMAP went thru this, filename -> inode -> Content based hash - Arnaldo
On Thu, 28 Aug 2025 at 11:58, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > > > >Give the damn thing an actual filename or something *useful*, not a > >number that user space can't even necessarily match up to anything. > > A build ID? I think that's a better thing than the disgusting inode number, yes. That said, I think they are problematic too, in that I don't think they are universally available, so if you want to trace some executable without build ids - and there are good reasons to do that - you might hate being limited that way. So I think you'd be much better off with just actual pathnames. Are there no trace events for "mmap this path"? Create a good u64 hash from the contents of a 'struct path' (which is just two pointers: the dentry and the mnt) when mmap'ing the file, and then you can just associate the stack trace entry with that hash. That should be simple and straightforward, and hashing two pointers should be simple and straightforward. And then matching that hash against the mmap event where the actual path was saved off gives you an actual *pathname*. Which is *so* much better than those horrific inode numbers. And yes, yes, obviously filenames can go away and aren't some kind of long-term stable thing. But inode numbers can be re-used too, so that's no different. With the "create a hash of 'struct path' contents" you basically have an ID that can be associated with whatever the file name was at the time it was mmap'ed into the thing you are tracing, which is I think what you really want anyway. Now, what would be even simpler is to not create a hash at all, but simply just create the whole pathname when the stack trace entry is created. But it would probably waste too much space, since you'd probably want to have at least 32 bytes (as opposed to just 64 bits) for a (truncated) pathname. And it would be more expensive than just hashing the dentry/mnt pointers, although '%pD' isn't actually *that* expensive. 
But probably expensive enough to not really be acceptable. I'm just throwing it out as a stupid idea that at least generates much more usable output than the inode numbers do. Linus
On Thu, 28 Aug 2025 12:18:39 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Thu, 28 Aug 2025 at 11:58, Arnaldo Carvalho de Melo > <arnaldo.melo@gmail.com> wrote: > > > > > >Give the damn thing an actual filename or something *useful*, not a > > >number that user space can't even necessarily match up to anything. > > > > A build ID? > > I think that's a better thing than the disgusting inode number, yes. I don't care what it is. I picked inode/device just because it was the only thing I saw available. I'm not sure build ID is appropriate either. > > That said, I think they are problematic too, in that I don't think > they are universally available, so if you want to trace some > executable without build ids - and there are good reasons to do that - > you might hate being limited that way. > > So I think you'd be much better off with just actual pathnames. As you mentioned below, the reason I avoided path names is that they take up too much of the ring buffer, and would be duplicated all over the place. I've run this for a while, and it only picked up a couple of hundred paths while the trace had several thousand stack traces. > > Are there no trace events for "mmap this path"? Create a good u64 hash > from the contents of a 'struct path' (which is just two pointers: the > dentry and the mnt) when mmap'ing the file, and then you can just > associate the stack trace entry with that hash. I would love to have a hash to use. The next patch does the mapping of the inode numbers to their path name. It can easily be switched over to do the same with a hash number. > > That should be simple and straightforward, and hashing two pointers > should be simple and straightforward. Would a hash of these pointers have any collisions? That would be bad. Hmm, I just tried using the pointer to vma->vm_file->f_inode, and that gives me a unique number. Then I just need to map that back to the path name: trace-cmd-1016 [002] ...1. 
34.675646: inode_cache: inode=ffff8881007ed428 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libc.so.6 trace-cmd-1016 [002] ...1. 34.675893: inode_cache: inode=ffff88811970e648 dev=[254:3] path=/usr/local/lib64/libtracefs.so.1.8.2 trace-cmd-1016 [002] ...1. 34.675933: inode_cache: inode=ffff88811970b8f8 dev=[254:3] path=/usr/local/lib64/libtraceevent.so.1.8.4 trace-cmd-1016 [002] ...1. 34.675981: inode_cache: inode=ffff888110b78ba8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 bash-1007 [003] ...1. 34.677316: inode_cache: inode=ffff888103f05d38 dev=[254:3] path=/usr/bin/bash bash-1007 [003] ...1. 35.432951: inode_cache: inode=ffff888116be94b8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 bash-1018 [005] ...1. 36.104543: inode_cache: inode=ffff8881007e9dc8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 bash-1018 [005] ...1. 36.110407: inode_cache: inode=ffff888110b78298 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 bash-1018 [005] ...1. 36.110536: inode_cache: inode=ffff888103d09dc8 dev=[254:3] path=/usr/local/bin/trace-cmd I just swapped out the inode with the above (unsigned long)vma->vm_file->f_inode, and it appears to be unique. Thus, I could use that as the "hash" value and then the above could be turned into: trace-cmd-1016 [002] ...1. 34.675646: inode_cache: hash=ffff8881007ed428 path=/usr/lib/x86_64-linux-gnu/libc.so.6 trace-cmd-1016 [002] ...1. 34.675893: inode_cache: hash=ffff88811970e648 path=/usr/local/lib64/libtracefs.so.1.8.2 trace-cmd-1016 [002] ...1. 34.675933: inode_cache: hash=ffff88811970b8f8 path=/usr/local/lib64/libtraceevent.so.1.8.4 trace-cmd-1016 [002] ...1. 34.675981: inode_cache: hash=ffff888110b78ba8 path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 bash-1007 [003] ...1. 34.677316: inode_cache: hash=ffff888103f05d38 path=/usr/bin/bash bash-1007 [003] ...1. 35.432951: inode_cache: hash=ffff888116be94b8 path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 bash-1018 [005] ...1. 
36.104543: inode_cache: hash=ffff8881007e9dc8 path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 bash-1018 [005] ...1. 36.110407: inode_cache: hash=ffff888110b78298 path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 bash-1018 [005] ...1. 36.110536: inode_cache: hash=ffff888103d09dc8 path=/usr/local/bin/trace-cmd This would mean the readers of the userstacktrace_delay need to also have this event enabled to do the mappings. But that shouldn't be an issue. -- Steve
On Thu, 28 Aug 2025 at 13:17, Steven Rostedt <rostedt@kernel.org> wrote: > > > > > That should be simple and straightforward, and hashing two pointers > > should be simple and straightforward. > > Would a hash of these pointers have any collisions? That would be bad. What? Collisions in 64 bits when you have a handful of cases around? Not an issue unless you picked your hash to be something ridiculous. Linus
On Thu, 28 Aug 2025 13:38:33 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Thu, 28 Aug 2025 at 13:17, Steven Rostedt <rostedt@kernel.org> wrote: > > > > > > > > That should be simple and straightforward, and hashing two pointers > > > should be simple and straightforward. > > > > Would a hash of these pointers have any collisions? That would be bad. > > What? Collisions in 64 bits when you have a handful of cases around? > Not an issue unless you picked your hash to be something ridiculous. > Since I only need a unique identifier, and it appears that the vma->vm_file->f_inode pointer is unique, would just using that be OK? I could run it through the same hash algorithm that "%p" goes through so that it's not a real memory address. As getting to the path does require some more logic to get to. Not to mention, this may later need to handle JIT code (and we'll need a way to map to that too). -- Steve
On Thu, 28 Aug 2025 at 13:48, Steven Rostedt <rostedt@kernel.org> wrote: > > I could run it through the same hash algorithm that "%p" goes through so > that it's not a real memory address. For '%p', people can't easily trigger lots of different cases, and you can't force kernel printouts from user space. For something like tracing, user space *does* control the output, and you shouldn't give people visibility into the hashing that '%p' does. So you can certainly use siphash for hashing, but make sure to not use the same secret key that the printing does. As to the ID to hash, I actually think a 'struct file *' might be the best thing to use - that's directly in the vma, no need to follow any other pointers for it. Linus
On Thu, 28 Aug 2025 14:06:39 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > So you can certainly use siphash for hashing, but make sure to not use > the same secret key that the printing does. Right, I just meant to use the same algorithm. The key would be different. > > As to the ID to hash, I actually think a 'struct file *' might be the > best thing to use - that's directly in the vma, no need to follow any > other pointers for it. But that's unique per task, right? What I liked about the f_inode pointer, is that it appears to be shared between tasks. I only want to add a new hash and print the path for a new file. If several tasks are using the same file (which they are with the libraries), then having the hash be the same between tasks would be more efficient. -- Steve
On Thu, 28 Aug 2025 at 14:17, Steven Rostedt <rostedt@kernel.org> wrote: > > But that's unique per task, right? What I liked about the f_inode > pointer, is that it appears to be shared between tasks. I actually think the local meaning of the file pointer is an advantage. It not only means that you see the difference in mappings of the same file created with different open calls, it also means that when different processes mmap the same executable, they don't see the same hash. And because the file pointer doesn't have any long-term meaning, it also means that you also can't make the mistake of thinking the hash has a long lifetime. With an inode pointer hash, you could easily have software bugs that end up not realizing that it's a temporary hash, and that the same inode *will* get two different hashes if the inode has been flushed from memory and then loaded anew due to memory pressure. > I only want to add a new hash and print the path for a new file. If > several tasks are using the same file (which they are with the > libraries), then having the hash be the same between tasks would be > more efficient. Why? See above why I think it's a mistake to think those hashes have lifetimes. They don't. Two different inodes can have the same hash due to lifetime issues, and the same inode can get two different hashes at different times for the same reason. So you *need* to tie these things to the only lifetime that matters: the open/close pair (and the mmap - and the stack traces - will be part of that lifetime). I literally think that you are not thinking about this right if you think you can re-use the hash. Linus
On Thu, 28 Aug 2025 15:10:52 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > And because the file pointer doesn't have any long-term meaning, it > also means that you also can't make the mistake of thinking the hash > has a long lifetime. With an inode pointer hash, you could easily have > software bugs that end up not realizing that it's a temporary hash, > and that the same inode *will* get two different hashes if the inode > has been flushed from memory and then loaded anew due to memory > pressure. The hash value can actually last longer than the file pointer. Thus, if we use the file pointer for the hash, then we could risk it getting freed and then allocated again for different file. Then we get the same hash value for two different paths. What I'm looking at doing is using both the file pointer as well as its path to make the hash: struct jhash_key { void *file; struct path path; }; u32 trace_file_cache_add(struct vm_area_struct *vma) { [..] static u32 initval; u32 hash; if (!vma->vm_file) return 0; if (!initval) get_random_bytes(&initval, sizeof(initval)); jkey.file = vma->vm_file; jkey.path = vma->vm_file->f_path; hash = jhash(&jkey, sizeof(jkey), initval); if (!trace_file_cache_enabled()) return hash; [ add the hash to the rhashtable and print if unique ] Hopefully by using both the file pointer and its path to create the hash, it will stay unique for some time. -- Steve
On Fri, 29 Aug 2025 at 08:06, Steven Rostedt <rostedt@goodmis.org> wrote: > > The hash value can actually last longer than the file pointer. Thus, if we > use the file pointer for the hash, then we could risk it getting freed and > then allocated again for different file. Then we get the same hash value > for two different paths. No, no no. That doesn't mean that the hash "lasts longer" than the file pointer. Quite the reverse. It is literally about the fact that YOU HAVE TO TAKE LIFETIMES INTO ACCOUNT. So you are being confused, and that shows in your "solution". And the thing is, the "you have to take lifetimes into account' is true *regardless* of what you use as your index. It was true even with inode numbers and major/minor numbers, in that file deletion and creation would basically end up reusing the same "hash". And this is my *point*: the advantage of the 'struct file *' is that it is a local thing that gets reused potentially quite quickly, and *forces* you to get the lifetime right. So don't mess that up. Except you do, and suggest this instead: > What I'm looking at doing is using both the file pointer as well as its > path to make the hash: NO NO NO. Now you are only saying "ok, I have a bogus lifetime, so I'll make a hash where that isn't obvious any more because reuse is harder to trigger". IOW: YOU ARE MAKING THE BUG WORSE BY HIDING IT. You're not fixing anything at all. You are literally making it obvious that your design is bogus and you're not thinking things through. So stop it. Really. Instead, realize that *ANY* hash you use has a limited lifetime, and the *ONLY* validity of that random number - whether it's a hash of the file pointer, an inode number, or anything else - is DURING THE MAPPING THAT IT USES. As long as the mapping exists, you know the thing is stable, because the mapping has a reference to the file (which has a reference to the path, which has a reference to the inode - so it all stays consistent and stable). 
But *immediately* when the mapping goes away, it's now no longer valid to think it has some meaning any more. Really. It might be a temporary file that was already unlinked, and the 'munmap()' is the last thing that releases the inode and it gets deleted from disk, and a new inode is created with the exact same inode number, and maybe even the exact same 'struct inode *' pointer. And as long as you don't understand this, you will always get this wrong, and you'll create bogus "workarounds" that just hide the REAL bug. Bogus workarounds like making a random different hash that is just less likely to show your mistake. In other words, to get this right, you *have* to associate the hash with the mmap creation that precedes it in the trace. You MUST NOT reuse it, not unless you also have some kind of reference count model that keeps track of how many mmap's that hash is associated with. Put another way: the only valid way to reuse it is if you manually track the lifetime of it. Anything else is WRONG. Now, tracking the actual lifetime of the hash is probably doable, but it's complex and error-prone. You can't do it by using the reference count in the 'struct file' itself, because that would keep the lifetime of the file artificially elevated, so you'd have to do some entirely separate thing that tracks things. Don't do it. Anyway, the way to fix this is to not care about lifetimes at all: just treat the hash as the random number it is, and just accept the fact that the number gets actively reused and has no meaning. Instead, just make sure that when you *use* the hash in user space, you always associate the hash with the previous trace event for the mmap that used that hash. You need to look up the event anyway to figure out what the hash means. And this is where the whole "short lifetime" is so important. It's what *forces* you to get this right, instead of doing the wrong thing and thinking that hashes have lifetimes that they simply do not have. 
The number in the stack trace - regardless of what that number is - is *ONLY* valid if you associate it with the last mmap that created that number. You don't even have to care about the unmap event, because that unmap - while it will potentially kill the lifetime of the hash if it was the last use of that file - also means that now there won't be any new stack traces with that hash any more. So you can ignore the lifetime in that respect: all that matters is that yes, it can get re-used, but you'll see a new mmap event with that hash if it is. (And then you might still have the complexity with per-cpu trace buffers etc where the ordering between an mmap event on one CPU might not be entirely obvious wrt the stack trace even on another CPU with a different thread that shares the same VM, but that's no different from any of the other percpu trace buffer ordering issues). Linus
On Fri, 29 Aug 2025 08:47:44 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > You don't even have to care about the unmap event, because that unmap > - while it will potentially kill the lifetime of the hash if it was > the last use of that file - also means that now there won't be any new > stack traces with that hash any more. So you can ignore the lifetime > in that respect: all that matters is that yes, it can get re-used, but > you'll see a new mmap event with that hash if it is. Basically what I need is that every time I add a file/hash mapping to the hashtable, I really need a callback to know when that file goes away. And then I can remove it from the hash table, so that the next time that hash is added, it will trigger another "print the file associated with this hash". It's OK to have the same hash for multiple files as long as it is traced. All events have timestamps associated with them, so it is trivial to map which hash mapping belongs to which stack trace. The reason for the file_cache is to keep from having to do the lookup of the file at every stack trace. But if I can have a callback for when that vma gets changed, (as I'm assuming the file will last as long as the vma is unmodified), then the callback could remove the hash value and this would not be a problem. My question now is, is there a callback that can be registered by the file_cache to know when the vma or the file change? -- Steve
On Fri, 29 Aug 2025 at 09:18, Steven Rostedt <rostedt@goodmis.org> wrote: > > Basically what I need is that every time I add a file/hash mapping to the > hashtable, I really need a callback to know when that file goes away. And > then I can remove it from the hash table, so that the next time that hash is > added, it will trigger another "print the file associated with this hash". That works, but why would you care? Why don't you just register the hash value and NOT CARE. Leave it all to later when the trace gets analyzed. Leave it be. The normal situation is presumably going to be that millions of stack traces will be generated, and nobody will even look at them. > My question now is, is there a callback that can be registered by the > file_cache to know when the vma or the file change? No. And what's the point? I just told you that unmap doesn't matter. All that matters is mmap. Don't try to "reuse" hashes. Just treat them as opaque numbers. Linus
On Fri, 29 Aug 2025 09:28:41 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > Don't try to "reuse" hashes. Just treat them as opaque numbers. What do I use to make the hash? One thing this is trying to do is not have to look up the path name for every line of a stack trace. I could just have every instance do the full look up, make a hash from the path name and build id, and pass the hash to the caller. My worry is the time it takes to generate that. Perhaps I could have a hash that maps the pid with the vma->vm_start, and if that's unique, get the path and build-id and create the hash for that and send it back to the user. Save the hash for that mapping in the rhashtable with the pid/vm_start as the key. Then the code that adds the vma, will see if the pid/vma->start exists, if it does, return the hash associated with that, if it does not, add it and trigger the event that a new address has been created. -- Steve
On Fri, 29 Aug 2025 at 09:49, Steven Rostedt <rostedt@goodmis.org> wrote: > > What do I use to make the hash? Literally just '(unsigned long)(vma->vm_file)'. Nothing else. > One thing this is trying to do is not have to look up the path name for > every line of a stack trace. That's the *opposite* of what I've been suggesting. I've literally been talking about just saving off the hash of the file pointer. (And I just suggested that what you actually save off isn't even the hash - just the value - and that you can hash it later at a less critical point in time) Don't do *any* work at all at trace collection time. All you need is to literally access three fields in the 'vma': - 'vm_start' and 'vm_pgoff' are needed to calculate the offset in the file using the user space address - save off the value of 'vm_file' for later hashing and I really think you're done. Then, for the actual trace, you need two things: - you need the mmap trace event that has the 'file' value, and you create a mmap event with that value hashed, and at that point you also output the pathname and/or things like the build ID - for the stack trace events, you output the offset in the file, and you hash and output the file value now, in user space, you have all you need. All you do is match the hashes. They are random numbers, and user space cannot know what they are. They are just cookies as a mapping ID. And look, now you have the pathname and the build ID - or whatever you saved off in that mmap event. And at stack trace time, you needed to do *nothing*. And mmap is rare enough - and heavy enough - that doing that pathname and build ID at *that* point is a non-issue. See what I'm trying to say? Linus
On August 29, 2025 1:59:21 PM GMT-03:00, Linus Torvalds <torvalds@linux-foundation.org> wrote: >On Fri, 29 Aug 2025 at 09:49, Steven Rostedt <rostedt@goodmis.org> wrote: >> >> What do I use to make the hash? > >Literally just '(unsigned long)(vma->vm_file)'. > >Nothing else. > >> One thing this is trying to do is not have to look up the path name for >> every line of a stack trace. > >That's the *opposite* of what I've been suggesting. I've literally >been talking about just saving off the hash of the file pointer. > >(And I just suggested that what you actually save off isn't even the >hash - just the value - and that you can hash it later at a less >critical point in time) > >Don't do *any* work at all at trace collection time. All you need is >to literally access three fields in the 'vma': > > - 'vm_start' and 'vm_pgoff' are needed to calculate the offset in the >file using the user space address > > - save off the value of 'vm_file' for later hashing > >and I really think you're done. > >Then, for the actual trace, you need two things: > > - you need the mmap trace event that has the 'file' value, and you >create a mmap event with that value hashed, and at that point you also >output the pathname and/or things like the build ID > > - for the stack trace events, you output the offset in the file, and >you hash and output the file value > >now, in user space, you have all you need. All you do is match the >hashes. They are random numbers, and user space cannot know what they >are. They are just cookies as a mapping ID. > >And look, now you have the pathname and the build ID - or whatever you >saved off in that mmap event. And at stack trace time, you needed to >do *nothing*. > >And mmap is rare enough - and heavy enough - that doing that pathname >and build ID at *that* point is a non-issue. Or using a preexisting one in the DSO used for the executable mmap. 
As long as we don't lose those mmap events due to memory pressure/lost events and we have timestamps to order it all before lookups, yeah should work. - Arnaldo > >See what I'm trying to say? > > Linus - Arnaldo
On Fri, 29 Aug 2025 at 10:18, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > > As long as we don't lose those mmap events due to memory pressure/lost > events and we have timestamps to order it all before lookups, yeah > should work. The main reason to lose mmap events that I can see is that you start tracing in the middle of running something (for example, tracing systemd or some other "started at boot" thing). Then you'd not have any record of an actual mmap at all because it happened before you started tracing, even if there is no memory pressure or other thing going on. That is not necessarily a show-stopper: you could have some fairly simple count for "how many times have I seen this hash", and add a "mmap reminder" event (which would just be the exact same thing as the regular mmap event). You do it for the first time you see it, and every N times afterwards (maybe by simply using a counter array that is indexed by the low bits of the hash, and incrementing it for every hash you see, and if it was zero modulo N you do that "mmap reminder" thing). Yes, at that point you'd need to do that whole "generate path and build ID", but if 'N' is a large enough number, it's pretty rare. Maybe using a 16-bit counter would be sufficient (ie N would naturally be 65536 when it becomes zero again). That might be a good thing regardless just to have some guaranteed limit of how far back in the trace you need to go to find the mmap information for some hash. If you have long traces, maybe you don't want to walk back billions of events. But I wouldn't suggest doing that as a *first* implementation. I'm just saying that it could be added if people find that it's a problem. Linus
On Fri, 29 Aug 2025 10:33:38 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 10:18, Arnaldo Carvalho de Melo > <arnaldo.melo@gmail.com> wrote: > > > > As long as we don't lose those mmap events due to memory pressure/lost > > events and we have timestamps to order it all before lookups, yeah > > should work. > > The main reason to lose mmap events that I can see is that you start > tracing in the middle of running something (for example, tracing > systemd or some other "started at boot" thing). Note, for on-demand tracing, the applications are already running before the tracing starts. That is actually the common case. Yes, people do often "enabled tracing, run my code, stop tracing", but most of the use cases I deal with, it's (we are noticing something in the field, start tracing, issue gets hit, stop tracing), where the applications we are monitoring are already running when the tracing started. Just tracing the mmap when it happens will not be useful for us. Not to mention, in the future, this will also have to work with JIT. I was thinking of using 64 bit hashes in the stack trace, where the top bits are reserved for context (is this a file, or something dynamically created). > > Then you'd not have any record of an actual mmap at all because it > happened before you started tracing, even if there is no memory > pressure or other thing going on. > > That is not necessarily a show-stopper: you could have some fairly > simple count for "how many times have I seen this hash", and add a > "mmap reminder" event (which would just be the exact same thing as the > regular mmap event). I thought about clearing the file cache periodically, if for any other reason, but for dropped events where the mapping is lost. This is why I'm looking at clearing on "unmap". Yes, we don't care about unmap, but as soon as an unmap happens if that value gets used again then we know it's a new mapping. 
That is, drop the hashes out of the file cache when they are no longer around. The idea is this (pseudo code): user_stack_trace() { foreach vma in each stack frame: key = hash(vma->vm_file); if (!lookup(key)) { trace_file_map(key, generate_path(vma), generate_buildid(vma)); add_into_hash(key); } } } On unmapping: key = hash(vma->vm_file); remove_from_hash(key); Now if a new mmap happens where the vma->vm_file is reused, the lookup(key) will return false again and the file_map event will get triggered again. We don't need to look at the mmap() calls, as those new mappings may never end up in a user stack trace, and writing them out will just waste space in the ring buffer. -- Steve
On Fri, 29 Aug 2025 at 11:11, Steven Rostedt <rostedt@goodmis.org> wrote: > > The idea is this (pseudo code): > > user_stack_trace() { > foreach vma in each stack frame: > key = hash(vma->vm_file); > if (!lookup(key)) { > trace_file_map(key, generate_path(vma), generate_buildid(vma)); > add_into_hash(key); > } > } I see *zero* advantage to this. It's only doing stupid things that cost extra, and only because you don't want to do the smart thing that I've explained extensively that has *NONE* of these overheads. Just do the parsing at parse time. End of story. Or don't do this at all. Just forget the whole thing entirely. Throw the patch that started this all away, and just DON'T DO THIS. Linus
On Fri, 29 Aug 2025 13:54:08 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 11:11, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > The idea is this (pseudo code): > > > > user_stack_trace() { > > foreach vma in each stack frame: > > key = hash(vma->vm_file); > > if (!lookup(key)) { > > trace_file_map(key, generate_path(vma), generate_buildid(vma)); > > add_into_hash(key); > > } > > } > > I see *zero* advantage to this. It's only doing stupid things that > cost extra, and only because you don't want to do the smart thing that > I've explained extensively that has *NONE* of these overheads. > > Just do the parsing at parse time. End of story. What does "parsing at parse time" mean? > > Or don't do this at all. Justy forget the whole thing entirely. Throw > the patch that started this all away, and just DON'T DO THIS. Maybe we are talking past each other. When I get a user space stack trace, I get the virtual addresses of each of the user space functions. This is saved into a user stack trace event in the ring buffer that usually gets mapped right to a file for post processing. I still do the: user_stack_trace() { foreach addr each stack frame vma = vma_lookup(mm, addr); callchain[i++] = (addr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT); Are you saying that this shouldn't be done either? And to just record the virtual address in the chain and the vma->vm_start and vma->vm_pgoff in another event? Where the post processing could do the math? This other event could also record the path and build id. The question is, when do I record this vma event? How do I know it's new? I can't rely too much on other events (like mmap) and such as those events may have occurred before the tracing started. I have to have some way to know if the vma has been saved previously, which was why I had the hash lookup, and only add vma's on new instances. My main question is, when do I record the vma data event? -- Steve
On Fri, 29 Aug 2025 at 14:18, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > Just do the parsing at parse time. End of story. > > What does "parsing at parse time" mean? In user space. When parsing the trace events. Not in kernel space, when generating the events. Arnaldo already said it was workable. > When I get a user space stack trace, I get the virtual addresses of each of > the user space functions. This is saved into an user stack trace event in > the ring buffer that usually gets mapped right to a file for post > processing. > > I still do the: > > user_stack_trace() { > foreach addr each stack frame > vma = vma_lookup(mm, addr); > callchain[i++] = (addr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT); > > Are you saying that this shouldn't be done either? I'm saying that's ALL that should be done. And then that *ONE* single thing: callchain_filehash[i++] = hash(vma->vm_file); BUT NOTHING ELSE. None of that trace_file_map() garbage. None of that add_into_hash() garbage. NOTHING like that. You don't look at the hash. You don't "register" it. You don't touch it in any way. You literally just use it as a value, and user space will figure it out later. At event parsing time. At most, you could have some trivial optimization to avoid hashing the same pointer twice, ie have some single-entry cache of "it's still the same file pointer, I'll just use the same hash I calculated last time". And I mean *single*-level, because siphash is fast enough that doing anything *more* than that is going to be slower than just re-calculating the hash. In fact, you should probably do that optimization at the whole vma_lookup() level, and try to not look up the same vma multiple times when a common situation is probably that you'll have multiple stack frames all with entries pointing to the same executable (or library) mapping. Because "vma_lookup()" is likely about as expensive as the hashing is. Linus
On Fri, 29 Aug 2025 15:40:07 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 14:18, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > > > Just do the parsing at parse time. End of story. > > > > What does "parsing at parse time" mean? > > In user space. When parsing the trace events. > > Not in kernel space, when generating the events. > > Arnaldo already said it was workable. Perf does do things differently, as I believe it processes the events as it reads from the kernel (Arnaldo correct me if I'm wrong). For the tracefs code, the raw data gets saved directly into a file, and the processing happens after the fact. If a tool is recording, it still needs a way to know what those hash values mean, after the tracing is complete. Same for when the user cats the "trace" file. If the vma's have already been freed, when this happens, how do we map these hashes from the vma? Do we need to have trace events in the unmap to trigger them? If tracing is not recording anymore, those events will be dropped too. Also, we only want to record the vmas that are in the stack traces. Not just any vma. > > > When I get a user space stack trace, I get the virtual addresses of each of > > the user space functions. This is saved into an user stack trace event in > > the ring buffer that usually gets mapped right to a file for post > > processing. > > > > I still do the: > > > > user_stack_trace() { > > foreach addr each stack frame > > vma = vma_lookup(mm, addr); > > callchain[i++] = (addr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT); > > > > Are you saying that this shouldn't be done either? > > I'm saying that's ALL that should be done. And then that *ONE* single thing: > > callchain_filehash[i++] = hash(vma->vm_file); > > BUT NOTHING ELSE. > > None of that trace_file_map() garbage. > > None of that add_into_hash() garbage. > > NOTHING like that. You don't look at the hash. You don't "register" > it. You don't touch it in any way. 
You literally just use it as a > value, and user space will figure it out later. At event parsing time. I guess this is where I'm stuck. How does user space know what those hash values mean? Where does it get the information from? > > At most, you could have some trivial optimization to avoid hashing the > same pointer twice, ie have some single-entry cache of "it's still the > same file pointer, I'll just use the same hash I calculated last > time". > > And I mean *single*-level, because siphash is fast enough that doing > anything *more* than that is going to be slower than just > re-calculating the hash. > > In fact, you should probably do that optimization at the whole > vma_lookup() level, and try to not look up the same vma multiple times > when a common situation is probably that you'll have multiple stack > frames all with entries pointing to the same executable (or library) > mapping. Because "vma_lookup()" is likely about as expensive as the > hashing is. Yeah, we could add an optimization to store vma's in the callchain walk to see if the next call chain belongs to a previous one. Could even just cache the previous vma, as it's not as common to have one library calling into another and back again. That is, this would likely be useful: vma = NULL; foreach addr in callchain if (!vma || addr not in range of vma) vma = vma_lookup(addr); -- Steve
On Fri, 29 Aug 2025 at 16:09, Steven Rostedt <rostedt@goodmis.org> wrote: > > Perf does do things differently, as I believe it processes the events as it > reads from the kernel (Arnaldo correct me if I'm wrong). > > For the tracefs code, the raw data gets saved directly into a file, and the > processing happens after the fact. If a tool is recording, it still needs a > way to know what those hash values mean, after the tracing is complete. But the data IS ALL THERE. Really. That's the point. It's there in the same file, it just needs those mmap events that whoever parses it - whether it be perf, or somebody reading some tracefs code - sees the mmap data, sees the cookies (hash values) that implies, and then matches those cookies with the subsequent trace entry cookies. But what it does *NOT* need is munmap() events. What it does *NOT* need is translating each hash value for each entry by the kernel, when whoever reads the file can just remember and re-create it in user space. I'm done arguing. You're not listening, so I'll just let you know that I'm not pulling garbage. I've had enough garbage in tracefs, I'm still smarting from having to fix up the horrendous VFS interfaces, I'm not going to pull anything that messes up this too. Linus
On Fri, 29 Aug 2025 17:45:39 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > But what it does *NOT* need is munmap() events. > > What it does *NOT* need is translating each hash value for each entry > by the kernel, when whoever treads the file can just remember and > re-create it in user space. If we are going to rely on mmap, then we might as well get rid of the vma_lookup() altogether. The mmap event will have the mapping of the file to the actual virtual address. If we add a tracepoint at mmap that records the path and the address as well as the permissions of the mapping, then the tracer could then trace only those addresses that are executable. To handle missed events, on start of tracing, trigger the mmap event for every currently running tasks for their executable sections, and that will allow the tracer to see where the files are mapped. After that, the stack traces can go back to just showing the virtual addresses of the user space stack without doing anything else. Let the trace map the tasks memory to all the mmaps that happened and translate it that way. The downside is that there may be a lot of information to record. But the tracer could choose which task maps to trace via filters and if it's tracing all tasks, it just needs to make sure its buffer is big enough. -- Steve
On Sat, 30 Aug 2025 at 11:31, Steven Rostedt <rostedt@goodmis.org> wrote: > > If we are going to rely on mmap, then we might as well get rid of the > vma_lookup() altogether. The mmap event will have the mapping of the > file to the actual virtual address. It actually won't - not unless you also track every mremap etc. Which is certainly doable, but I'd argue that it's a lot of complexity. All you really want is an ID for the file mapping, and yes, I agree that it's very very annoying that we don't have anything that can then be correlated to user space any other way than also having a stage that tracks mmap. I've slept on it and tried to come up with something, and I can't. As mentioned, the inode->i_ino isn't actually exposed to user space as such at all for some common filesystems, so while it's very traditional, it really doesn't actually work. It's also almost impossible to turn into a path, which is what you often would want for many cases. That said, having slept on it, I'm starting to come around to the inode number model, not because I think it's a good model - it really isn't - but because it's a very historical mistake. And in particular, it's the same mistake we made in /proc/<xyz>/maps. So I think it's very very wrong, but it does have the advantage that it's a number that we already do export. But the inode we expose that way isn't actually the 'vma->vm_file->f_inode' as you'd think, it's actually inode = file_user_inode(vma->vm_file); which is subtly different for the backing inode case (ie overlayfs). Oh, how I dislike that thing, but using the same thing as /proc/<xyz>/maps does avoid some problems. Linus
On Sat, 30 Aug 2025 12:03:53 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Sat, 30 Aug 2025 at 11:31, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > If we are going to rely on mmap, then we might as well get rid of the > > vma_lookup() altogether. The mmap event will have the mapping of the > > file to the actual virtual address. > > It actually won't - not unless you also track every mremap etc. > > Which is certainly doable, but I'd argue that it's a lot of complexity. > > All you really want is an ID for the file mapping, and yes, I agree > that it's very very annoying that we don't have anything that can then > be correlated to user space any other way than also having a stage > that tracks mmap. > > I've slept on it and tried to come up with something, and I can't. As > mentioned, the inode->i_ino isn't actually exposed to user space as > such at all for some common filesystems, so while it's very > traditional, it really doesn't actually work. It's also almost > impossible to turn into a path, which is what you often would want for > many cases. > > That said, having slept on it, I'm starting to come around to the > inode number model, not because I think it's a good model - it really > isn't - but because it's a very historical mistake. > > And in particular, it's the same mistake we made in /proc/<xyz>/maps. > > So I think it's very very wrong, but it does have the advantage that > it's a number that we already do export. > > But the inode we expose that way isn't actually the > 'vma->vm_file->f_inode' as you'd think, it's actually > > inode = file_user_inode(vma->vm_file); > > which is subtly different for the backing inode case (ie overlayfs). > > Oh, how I dislike that thing, but using the same thing as > /proc/<xyz>/maps does avoid some problems. > Sorry for the late reply. 
I left to the Tracing Summit the following Monday, and when I got back home on Thursday, I came down with a nasty cold that prevented me from thinking about any of this. I just re-read the entire thread, and I'm still not sure where to go with this. Thus, let me start with what I'm trying to accomplish, and even add one example of a real world use case we would like to have. Several times we find issues with futexes causing applications to either lock up or cause long latency. Since a futex is mostly managed in user space, it's good to be able to at least have a backtrace of where a contended futex occurs. Thus we start tracing the futex system call and triggering a user space backtrace on each one. Using this information can help us figure out where the futex contention lies. This is just one use case, we do have others. Ideally, the user space stack trace should look like: futex_requeue-1044 [002] ..... 168.761423: <user stack unwind> cookie=31500000003 => <000000000009a9ee> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} => <0000000000001472> : path=/work/c/futex_requeue build_id={0xc02417ea,0x1f4e0143,0x338cf27d,0x506a7a5d,0x7884d090} => <0000000000092b7b> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} Where the above shows the callstack (offset from the file), the path to the file, and a build id of that file such that the tooling can verify that the path is indeed the same library/executable as for when the trace occurred. Note, the build-id isn't really necessary for my own use case, because the applications seldom change on a chromebook. I added it as it appears to be useful for others I've talked to that would like to use this. But printing a copy of the full path name and build-id at every stack trace is expensive. The path lookup may not be so bad, but the space on the ring buffer is. 
To compensate this, we could replace the path and build-id with a unique identifier, (being an inode/device or hash, or whatever) to associate that file. It may even work if it is unique per task. Then whenever one of these identifiers were to show up representing a new file, it would be printed. We could monitor an event that if a file is deleted, renamed, or whatever, and a new file with the same name comes around, the identifier with the path and build-id gets printed for the new file. Where the output would be, instead: sed-1037 [007] ...1. 167.362583: file_map: hash=0x51eff94b path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} [..] futex_requeue-1042 [007] ...1. 168.754128: file_map: hash=0xad2c6f1b path=/work/c/futex_requeue build_id={0xc02417ea,0x1f4e0143,0x338cf27d,0x506a7a5d,0x7884d090} [..] futex_requeue-1042 [007] ..... 168.757912: <user stack unwind> cookie=34900000008 => <00000000001001ca> : 0x51eff94b => <000000000000173c> : 0xad2c6f1b => <0000000000029ca8> : 0x51eff94b [.. repeats several more traces without having to save the path names again ..] It comes down to when do we print these mappings? I noticed that uprobes has hooks to all the mappings in the vma code as it needs to keep track of them. We could change those hooks to tracepoints, and have both uprobes and tracing monitor the changes, and when a new mapping happens, it traces it. Changing them to tracepoints may be useful anyway, as it would then turn them over to static branches and not a normal "if" statement. We could even add a file to tracefs that would trigger the dump of all files that are mapped executable for all currently running tasks. Then when tracing starts, it would trigger the "show all currently running task mappings" and then only do the mappings on demand. 
This way, the tracer would get the mappings of the identifier (or hash, or whatever) to the files and build-ids at the start of tracing, as well as get any of the mappings when they happen later on. This should have enough information for the post processing to put the stack traces back to what is ideal in the first place. That is, the tooling could output: futex_requeue-1044 [002] ..... 168.761423: <user stack unwind> cookie=31500000003 => <000000000009a9ee> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} => <0000000000001472> : path=/work/c/futex_requeue build_id={0xc02417ea,0x1f4e0143,0x338cf27d,0x506a7a5d,0x7884d090} => <0000000000092b7b> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} and hide the identifier that was used in the ring buffer. -- Steve
On Mon, 8 Sept 2025 at 14:42, Steven Rostedt <rostedt@goodmis.org> wrote: > > I just re-read the entire thread, and I'm still not sure where to go with > this. So honestly, I don't know how to get where you want to get - or whether it's even *possible* without horrible performance impact. And no, we're not adding crap interfaces to mmap/munmap just for a stupid sysfs tracing thing. > Ideally, the user space stack trace should look like: > > futex_requeue-1044 [002] ..... 168.761423: <user stack unwind> cookie=31500000003 > => <000000000009a9ee> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} > => <0000000000001472> : path=/work/c/futex_requeue build_id={0xc02417ea,0x1f4e0143,0x338cf27d,0x506a7a5d,0x7884d090} > => <0000000000092b7b> : path=/usr/lib/x86_64-linux-gnu/libselinux.so.1 build_id={0x3ba6e0c2,0xdd815e8,0xe1821a58,0xa5940cef,0x7c7bc5ab} Yes. And I think that's what you should aim to generate. Not inode numbers, because inode numbers are the wrong thing. > Note, the build-id isn't really necessary for my own use case, because the > applications seldom change on a chromebook. I added it as it appears to be > useful for others I've talked to that would like to use this. My personal suspicion is that in reality, the pathname is sufficient. It's certainly a lot better than inode numbers are, in that the pathname is meaningful even after-the-fact, and even on a different machine etc. It's not some guaranteed match with some particular library or executable version, no. But for some random one-time quick scripting thing that uses sysfs, it's probably "good enough". The build id is certainly very convenient too, but it's not *always* convenient. And 99% of the time you could just look up the build id from the path, even though obviously that wouldn't work across machines and wouldn't work across system updates. 
> But printing a copy of the full path name and build-id at every stack trace > is expensive. The path lookup may not be so bad, but the space on the ring > buffer is. So that's the thing. You can do it right, or you can do it wrong. I'd personally tend to prefer the "expensive but right", and just make it a trace-time option. > To compensate this, we could replace the path and build-id with a unique > identifier, (being an inode/device or hash, or whatever) to associate that > file. It may even work if it is unique per task. Then whenever one of these > identifiers were to show up representing a new file, it would be printed. So I really hate the inode number, because it's just wrong. You can't match it across machines, and to make things worse it's not even *meaningful* over time or over machines - or to humans - so it's strictly clearly objectively worse than the pathname. But more importantly - even on the *local* machine - and at the moment - it's actually wrong. Exactly because the inode number you look up is *not* the user-visible inode number from 'stat()'. So it's *really* wrong to use the inode number. It's basically never right. And never will be, even if you can make it appear useful in some specific cases. The *one* saving grace for the inode number is that *in*the*moment* you can match it against /proc/<pid>/maps, because that /proc file has that historical bug too (it wasn't buggy at the time that /proc file was introduced, but our filesystems have become much more complex since). So if you do that inode = file_user_inode(vma->vm_file); that I mentioned, at least the otherwise random inode numbers can be matched to *something*. That still doesn't fix the other issues with inode numbers, but it means that at the time of the trace - and on the machine that the tracing is done - you can now match that not-quite-real inode number and device against another /proc file, and turn it into a pathname. 
But it's kind of sad to do that, when you could just do the pathname in the trace directly, and not force the stupid interface in the first place. And honestly, at that point it's still not really *better* than the pathname (and arguably much much worse, because you might not be able to do the matching if you didn't catch the /proc/<pid>/maps file). So the inode number - together with a lookup in /proc/<pid>/maps - is generally about the same as just giving a path, but typically much less convenient, and anybody using that interface would have to do extra work in user space. And *none* of these issues would be true of somebody who uses the 'perf()' interface that can do all of this much more efficiently, and without the downsides, and without any artificially limited sysfs interfaces. So that really makes me go: just don't expose this at all in sysfs files. You *cannot* do a good job in sysfs, because the interface is strictly worse than just doing the proper job using perf. Alternatively, just do the expensive thing. Expose the actual pathname, and expose the build ID. Yes, it's expensive, but dammit, that's the whole *point* of tracing in sysfs. sysfs was never about being efficient, it was about convenience. So if you trace through sysfs, you either don't get the full information that could be there, or pay the price for the expense of generating the full info. Make the "give me the expensive output" be a dynamic flag, so that you don't do it by default, but if you have some model where you are scripting things with shell-script rather than doing 'perf record', at least you get good output. Hmm? Linus
On Mon, 8 Sep 2025 16:09:50 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > > To compensate this, we could replace the path and build-id with a unique > > identifier, (being an inode/device or hash, or whatever) to associate that > > file. It may even work if it is unique per task. Then whenever one of these > > identifiers were to show up representing a new file, it would be printed. > > So I really hate the inode number, because it's just wrong. I just mentioned an identifier, it didn't need to be the inode. > > So if you do that > > inode = file_user_inode(vma->vm_file); And if I do end up using an inode, I'll make sure to use that. > And *none* of these issues would be true of somebody who uses the > 'perf()' interface that can do all of this much more efficiently, and > without the downsides, and without any artificially limited sysfs > interfaces. Note, there is no user space component running during the trace when tracing with tracefs, whereas perf requires a user space tool to be running along with what is being traced. The idea, is not to affect what is being traced by a user space tracer. Tracing is started when needed, and when the anomaly is detected, tracing is stopped, and then the tooling extracts the trace and post processes it. > > So that really makes me go: just don't expose this at all in sysfs > files. You *cannot* do a good job in sysfs, because the interface is > strictly worse than just doing the proper job using perf. > > Alternatively, just do the expensive thing. Expose the actual > pathname, and expose the build ID. Yes, it's expensive, but dammit, > that's the whole *point* of tracing in sysfs. sysfs was never about > being efficient, it was about convenience. Technically, it's "tracefs" and not "sysfs". When tracefs is configured, sysfs will create a directory called /sys/kernel/tracing to allow user space to mount tracefs there, but it is still a separate file system which can be mounted outside of sysfs. 
The code in tracefs is designed to be very efficient and tries very hard to keep the overhead down. The tracefs ring buffer is still 2 or 3 times faster than the perf buffer. It is optimized for tracing, and that's even why it doesn't rely on a user space component, as it's another way to allow always-on-tracing to not affect the system as much while tracing. > > So if you trace through sysfs, you either don't get the full > information that could be there, or pay the price for the expense of > generating the full info. But I will say the time to get the path name isn't an issue here. It's the size of the path name being recorded into the ring buffer. The ring buffer's size is limited, and a lot of compaction techniques are used to try to use it efficiently. As the stack trace only happens when the task goes back to user space, it's not as a time sensitive event as say function tracing. Thus spending a few more microseconds on something isn't going to cause much of a notice. An 8 byte identifier for a file is much better than the path name where it can be 40 bytes or more. In my example: /usr/lib/x86_64-linux-gnu/libselinux.so.1 is 41 bytes, 42 if you count the nul terminating byte. Now if we just hash the path name, that would be doable. Then when we see a new name pop up, we can trigger another event to show the path name (and perhaps even the build id). What's nice about triggering another event to show the full path name, is that you can put that other event into another buffer, to keep the path names from dropping stack traces, and vice versa. I liked an idea you had in a previous email: https://lore.kernel.org/all/CAHk-=wjgdKtBAAu10W04VTktRcgEMZu+92sf1PW-TV-cfZO3OQ@mail.gmail.com/ You do it for the first time you see it, and every N times afterwards (maybe by simply using a counter array that is indexed by the low bits of the hash, and incrementing it for every hash you see, and if it was zero modulo N you do that "mmap reminder" thing). 
Instead of an N counter, have a time expiry of say 200 milliseconds or so. At the stack trace, look at the path name, hash it, put it into a look up table, and if it's not there, trigger the other event to show the path name and save it and a timestamp into the look up table. If it's in the look up table, and the timestamp is greater than 200 milliseconds, trigger it again and reset the timestamp. The idea is only to remove duplicates, and move the longer names into a separate buffer. Recording full path names in every stack trace will make it much harder to use the information as it will lose a lot more stack traces. > > Make the "give me the expensive output" be a dynamic flag, so that you > don't do it by default, but if you have some model where you are > scripting things with shell-script rather than doing 'perf record', at > least you get good output. Note, it's not a shell script. We do have tooling, it's just that nothing runs while the trace is happening. The tooling starts the trace and exits. When the issue is discovered, tracing is stopped, and the tooling will then extract the trace and process it. If a crash occurs, the persistent ring buffer can be extracted to get the data. We will use this in the field. That is, on chromebooks where people have opted in to allow analysis of their machines. If there's an anomaly detected in thousands of users, we can start tracing and then extract the traces to debug what is happening on their machines. We want to make sure we get enough stack traces that will go back far enough to where the issue first occurred. Hence why we want to keep the traces small and compact. -- Steve
On Mon, 8 Sept 2025 at 16:09, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > Make the "give me the expensive output" be a dynamic flag, so that you > don't do it by default, but if you have some model where you are > scripting things with shell-script rather than doing 'perf record', at > least you get good output. Side note: you could make that dynamic flag be basically "per-target", by having the *tracer* open the files that it wants to match against, and guaranteeing that the dentry stays around by virtue of having opened the files. Then - I'm handwaving a bit here - you could have some "hash the dentry pointer" model. In that model, you couldn't use the 'struct file' hash, because now you're matching against different 'open()' cases: the tracer that uses sysfs would open the executable and the libraries it knows it is going to trace, and keep them open for the duration of the trace in order to have stable hashes for those files. All the tracer would need is some simple interface to "give me the hash for the file I just opened", and then it could easily match that against any hashes it sees in sysfs stack traces. The advantage of this model is that now the tracer not only has the hash, and controls the lifetime, it means that the tracer also can decide to look up build IDs etc if it wants to. The disadvantage is obvious, though: this requires that the tracer know what the files in question are. Of course, that's usually not that hard. You might literally just know it a-priori (ie just from what you are tracing along with having run 'ldd' etc), but for the 'I'm tracing a running process' you can use that /proc/<pid>/maps file to start populating your hash information. 
I'm *not* claiming this is a wonderful interface, but it's at least a *fairly* straightforward way to give some kind of cheap hash ID for the user space traces, and it puts the burden of "hash lifetime" clearly on user space, not on the kernel having to try to maintain some kind of hash map. In other words: if user space wants to get good information, maybe user space needs to work at it a bit. The kernel side shouldn't be made complicated or be expected to bend over backwards. Linus
On August 30, 2025 3:31:14 PM GMT-03:00, Steven Rostedt <rostedt@goodmis.org> wrote: >On Fri, 29 Aug 2025 17:45:39 -0700 >Linus Torvalds <torvalds@linux-foundation.org> wrote: > >> But what it does *NOT* need is munmap() events. >> >> What it does *NOT* need is translating each hash value for each entry >> by the kernel, when whoever treads the file can just remember and >> re-create it in user space. > >If we are going to rely on mmap, then we might as well get rid of the >vma_lookup() altogether. The mmap event will have the mapping of the >file to the actual virtual address. > >If we add a tracepoint at mmap that records the path and the address as >well as the permissions of the mapping, then the tracer could then >trace only those addresses that are executable. > PERF_RECORD_MMAP2 (MMAP had just the filename); <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/perf_event.h#n1057> >To handle missed events, on start of tracing, trigger the mmap event >for every currently running tasks for their executable sections, and >that will allow the tracer to see where the files are mapped. Perf does synthesize the needed mmap events by traversing procfs, if needed. Jiri at some point toyed with BPF iterators to do as you suggest: from the kernel iterate task structs and generate the PERF_RECORD_MMAP2 for preexisting processes. >After that, the stack traces can go back to just showing the virtual >addresses of the user space stack without doing anything else. Let the >trace map the tasks memory to all the mmaps that happened and translate >it that way. > >The downside is that there may be a lot of information to record. It is, but for system wide cases, etc. Want to see it all? There's a cost... - Arnaldo But >the tracer could choose which task maps to trace via filters and if it's >tracing all tasks, it just needs to make sure its buffer is big enough. > >-- Steve - Arnaldo
On Fri, 29 Aug 2025 17:45:39 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 16:09, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > Perf does do things differently, as I believe it processes the events as it > > reads from the kernel (Arnaldo correct me if I'm wrong). > > > > For the tracefs code, the raw data gets saved directly into a file, and the > > processing happens after the fact. If a tool is recording, it still needs a > > way to know what those hash values mean, after the tracing is complete. > > But the data IS ALL THERE. But only in the kernel. How do I expose it? > > Really. That's the point. > > It's there in the same file, it just needs those mmap events that > whoever parses it - whether it be perf, or somebody reading some What mmap events are you talking about? Nothing happens to be tracing mmap events. An interrupt triggered, we want a user space stack trace for that interrupt, it records the kernel stack trace and a cookie that gets matched to the user stack trace. It is then deferred until it goes back to user space and the deferred infrastructure does a callback to the tracer with the list of addresses that represent the user space call stack. We do a vma_lookup() to get the vma of each of those addresses. Now we make some hash that represents that vma for each address. But there has been no event that maps to this vma to what the file is. And the vma's in these stack traces are a subset of all the vma's. When the user finally gets around to reading them, the vmas could be long gone. How is user space supposed to find out what files they belong to? Do we need to record most events to grab all the vma's and the files they belong to? Note, one of the constraints to tracing is the buffer size. We don't want to be recording information that we don't care about. 
> tracefs code - sees the mmap data, sees the cookies (hash values) that > implies, and then matches those cookies with the subsequent trace > entry cookies. That was basically what I was doing with the vma hash table. To print out the vmas as soon as a new one is referenced. It created the event needed, and only for the vmas we care about. > > But what it does *NOT* need is munmap() events. This wouldn't be recording munmap events. It would use the unmap event to callback and remove the vma from the hash when they happened, so that if they get reused the new ones would be printed. It's no different if we use munmap or mmap. I could hook into the mmap event instead and check if it is in the vma hash and if so, either reprint it, or remove it so if the vma is in a call stack it would get reprinted. Writing the file for every mmap seems to be a waste of ring buffer space if the majority of them are not going to be in a stack trace. > > What it does *NOT* need is translating each hash value for each entry > by the kernel, when whoever reads the file can just remember and > re-create it in user space. What's reading the files? The applications that are being traced? > > I'm done arguing. You're not listening, so I'll just let you know that I am listening. I'm just not understanding you. > I'm not pulling garbage. I've had enough garbage in tracefs, I'm still > smarting from having to fix up the horrendous VFS interfaces, I'm not > going to pull anything that messes up this too. I know you keep bringing up the tracefs eventfs issue. Hey, I asked for help with that when I first started it. I was basically told by some of the VFS folks (I'm not going to name names) that "don't worry, if it works it's fine". I was very worried that I wasn't doing it right. And it wasn't until you got involved where you were the first one to tell me that using dentry outside of VFS was a bad idea. Most of our arguing then was because I didn't understand that. 
That also led to the "garbage" code you had to fix up. So keep bringing that up. It just shows how much tribal knowledge is needed to work in the kernel. Heck, the VFS folks are still arguing about how to handle things like kernfs. Which is similar to the eventfs issue. And that boils down to things like kernfs, eventfs and procfs have a fundamental difference to all other file systems. And that is it's a file interface to the kernel itself, and not some external source. I realized this during our arguments over eventfs. You do a write or read from a file, and unlike other file systems, those actions trigger kernel functions outside of vfs. But this is another topic altogether, and I only brought it up because you did. -- Steve
On Fri, 29 Aug 2025 21:20:23 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > > I'm done arguing. You're not listening, so I'll just let you know that > > I am listening. I'm just not understanding you. BTW, I'm not arguing with you. I'm really trying hard to figure out what it is that you want me to do. I'm looking for that "don't use dentry outside of VFS" moment. I get we have in the call stack the offsets of the file and a magical hash value that represents that vma. What I don't get is what is user space supposed to match that magical hash value to? Do you want me to trace all mmaps and trigger an event for them that show the hash value and the path names? If that's the case, what do I do about the major use case of tracing an application after it has mapped all its memory to files? What about wasted ring buffer space for recording every mmap when the majority of them will not be used? It could risk dropping events of the mmaps we care about. Again, I'm not arguing with you. I'm trying to figure out what you are suggesting. -- Steve
On Fri, 29 Aug 2025 19:09:35 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > Yeah, we could add an optimization to store vma's in the callchain walk to > see if the next call chain belongs to a previous one. Could even just cache > the previous vma, as it's not as common to have one library calling into > another and back again. Although, it does happen with libc. :-p cookie=300000004 => <000000000008f687> : 0x666220af => <0000000000014560> : 0x88512fee => <000000000001f94a> : 0x88512fee => <000000000001fc9e> : 0x88512fee => <000000000001fcfa> : 0x88512fee => <000000000000ebae> : 0x88512fee => <0000000000029ca8> : 0x666220af The 0x666220af is libc, where the first item is (according to objdump): 000000000008f570 <__libc_alloca_cutoff@@GLIBC_PRIVATE>: And the last one (top of the stack) is: 0000000000029c20 <__libc_init_first@@GLIBC_2.2.5>: Of course libc starts the application, and then the application will likely call back into libc. We could optimize for this case with: first_vma = NULL; vma = NULL; foreach addr in callchain if (!first_vma) vma = first_vma = vma_alloc() else if (addr in range of first_vma) vma = first_vma else (addr not in range of vma) vma = vma_lookup(addr); -- Steve
On Fri, 29 Aug 2025 19:09:35 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > > NOTHING like that. You don't look at the hash. You don't "register" > > it. You don't touch it in any way. You literally just use it as a > > value, and user space will figure it out later. At event parsing time. > > I guess this is where I'm stuck. How does user space know what those hash > values mean? Where does it get the information from? This is why I had the stored hash items in a "file_cache". It was the way to know that a new vma is being used and needs an event to show it. > > That is, this would likely be useful: > > vma = NULL; > foreach addr in callchain > if (!vma || addr not in range of vma) > vma = vma_lookup(addr); You already stated that the vma_lookup() and the hash algorithm is very expensive, but they need to be done anyway. A simple hash lookup is quick and would be lost in the noise. vma = NULL; hash = 0; foreach addr in callchain if (!vma || addr not in range of vma) { vma = vma_lookup(addr); hash = get_hash(vma); } callchain[i] = addr - offset; hash[i] = hash; I had that get_hash(vma) have something like: u32 get_hash(vma) { unsigned long ptr = (unsigned long)vma->vm_file; u32 hash; /* Remove alignment */ ptr >>= 3; hash = siphash_1u32((u32)ptr, &key); if (lookup_hash(hash)) return hash; // already saved // The above is the most common case and is quick. // Especially compared to vma_lookup() and the hash algorithm /* Slow but only happens when a new vma is discovered */ trigger_event_that_maps_hash_to_file_data(hash, vma); /* Doesn't happen again for this hash value */ save_hash(hash); This is also where I would have a callback from munmap() to remove the vmas from this hash table because they are no longer around. And if a new vma came around with the same vm_file address, it would not be found in the hash table and would trigger the print again with the hash and the new file it represents. 
This "garbage" was how I implemented the way to let user space know what the meaning of the hash values are. Otherwise, we need something else to expose to user space what those hashes mean. And that's where I don't know what you are expecting. -- Steve
On Fri, 29 Aug 2025 19:42:46 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > vma = NULL; > hash = 0; > foreach addr in callchain > if (!vma || addr not in range of vma) { > vma = vma_lookup(addr); > hash = get_hash(vma); > } > callchain[i] = addr - offset; > hash[i] = hash; > > > I had that get_hash(vma) have something like: > > > u32 get_hash(vma) { > unsigned long ptr = (unsigned long)vma->vm_file; > u32 hash; > > /* Remove alignment */ > ptr >>= 3; > hash = siphash_1u32((u32)ptr, &key); Oh, this hash isn't that great, as it did appear to have collisions. But I saw in vsprintf() it has something like: #ifdef CONFIG_64BIT return (u32)(unsigned long)siphash_1u64((u64)ptr, &key); #else return (u32)siphash_1u32((u32)ptr, &key); #endif Which for the 64 bit version, it uses all the bits to calculate the hash, and the resulting bottom 32 is rather a good spread. > > if (lookup_hash(hash)) > return hash; // already saved > > // The above is the most common case and is quick. > // Especially compared to vma_lookup() and the hash algorithm > > /* Slow but only happens when a new vma is discovered */ > trigger_event_that_maps_hash_to_file_data(hash, vma); > > /* Doesn't happen again for this hash value */ > save_hash(hash); So this basically creates the output of: trace-cmd-1034 [003] ..... 142.197674: <user stack unwind> cookie=300000004 => <000000000008f687> : 0x666220af => <0000000000014560> : 0x88512fee => <000000000001f94a> : 0x88512fee => <000000000001fc9e> : 0x88512fee => <000000000001fcfa> : 0x88512fee => <000000000000ebae> : 0x88512fee => <0000000000029ca8> : 0x666220af trace-cmd-1034 [003] ...1. 142.198063: file_cache: hash=0x666220af path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1034 [003] ...1. 
142.198093: file_cache: hash=0x88512fee path=/usr/local/bin/trace-cmd build_id={0x3f399e26,0xf9eb2d4d,0x475fa369,0xf5bb7eeb,0x6244ae85} Where the first instances of the vma with the values of 0x666220af and 0x88512fee get printed, but from then on, they are not. That is, from then on, the lookup will return true, and no processing will take place. And periodically, I could clear the hash cache, so that all vmas get printed again. But this would be rate limited to not cause performance issues. -- Steve
On Fri, 29 Aug 2025 at 08:47, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > Anyway, the way to fix this is to not care about lifetimes at all: > just treat the hash as the random number it is, and just accept the > fact that the number gets actively reused and has no meaning. Side note: the actual re-use of various pointers and/or inode numbers is going to be very very random. Classic old filesystems that live by inode numbers will use 'iget5_locked()' and will basically have the same 'struct inode *' pointer too when they re-use an inode number. And they likely also have a very simplistic inode allocation model and an unlink followed by a file creation probably *will* re-use that same inode number. So you can probably see 'struct inode *' get reused quite quickly and reliably for entirely unrelated files just based on file deletion/creation patterns. The dentry pointer will typically stick around rather aggressively, and will likely remain the same when you delete a file and create another one with the same name, and the mnt pointer will stick around too, so the contents of 'struct path' will be the exact same for two completely different files across a delete/create event. So hashing the path is very likely to stay the same as long as the actual path stays the same, but would be fairly insensitive to the underlying data changing. People might not care, particularly with executables and libraries that simply don't get switched around much. And, 'struct file *' will get reused randomly just based on memory allocation issues, but I wouldn't be surprised if a close/open sequence would get the same 'struct file *' pointer. So these will all have various different 'value stays the same, but the underlying data changed' patterns. I really think that you should just treat the hash as a very random number, not assign it *any* meaning at trace collection time, and the more random the better. 
And then do all the "figure it out" work in user space when *looking* at the traces. It might be a bit more work, and involve a bit more data, but I _think_ it should be very straightforward to just do a "what was the last mmap that had this hash" Linus
On Fri, 29 Aug 2025 09:07:58 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > The dentry pointer will typically stick around rather aggressively, > and will likely remain the same when you delete a file and create > another one with the same name, and the mnt pointer will stick around > too, so the contents of 'struct path' will be the exact same for two > completely different files across a delete/create event. I'm not sure how often a trace would expand the event of running code, deleting the code, recreating it, and running it again. But that means the stack traces of the original code will be useless regardless. But at a minimum, the recreating of the code should trigger another print, and this would give it a different build-id (which I'm not recording as well as the path). > > So hashing the path is very likely to stay the same as long as the > actual path stays the same, but would be fairly insensitive to the > underlying data changing. People might not care, particularly with > executables and libraries that simply don't get switched around much. > > And, 'struct file *' will get reused randomly just based on memory > allocation issues, but I wouldn't be surprised if a close/open > sequence would get the same 'struct file *' pointer. > > So these will all have various different 'value stays the same, but > the underlying data changed' patterns. I really think that you should > just treat the hash as a very random number, not assign it *any* > meaning at trace collection time, and the more random the better. > > And then do all the "figure it out" work in user space when *looking* > at the traces. It might be a bit more work, and involve a bit more > data, but I _think_ it should be very straightforward to just do a > "what was the last mmap that had this hash" I just realized that I'm using the rhashtable as an "does this hash exist". 
I could get the content of the item that matches the hash and compare it to what was used to create the hash in the first place. If there's a reference counter or some other identifier I could use to know that the passed in vma is the same as what is in the hash table, I can use this to know if the hash needs to be updated with the new information or not. -- Steve
On Fri, 29 Aug 2025 at 09:33, Steven Rostedt <rostedt@goodmis.org> wrote: > > I just realized that I'm using the rhashtable as an "does this hash exist". The question is still *why*? Just use the hash. Don't do anything to it. Don't mess with it. > I could get the content of the item that matches the hash and compare it to > what was used to create the hash in the first place. If there's a reference > counter or some other identifier I could use to know that the passed in vma > is the same as what is in the hash table, I can use this to know if the > hash needs to be updated with the new information or not. No such information exists. Sure, we have reference counts for everything: to a very close approximation, any memory allocation with external visibility has to be reference counted for correctness. But those reference counts are never going to tell you whether they are the *same* object that they were last time you looked at it, or just a new allocation that happens to have the same pointer. Don't even *TRY*. You still haven't explained why you would care. Your patch that used inode numbers didn't care. It just used the numbers. SO JUST USE THE NUMBERS, for chissake! Don't make them mean anything. Don't try to think they mean something. The *reason* I think hashing 'struct file *' is better than the alternative is exactly that it *cannot* mean anything. It will get re-used quite actively, even when nobody actually changes any of the files. So you are forced to deal with this correctly, even though you seem to be fighting dealing with it correctly tooth and nail. And at no point have you explained why you can't just treat it as meaningless numbers. The patch that started this all did exactly that. It just used the *wrong* numbers, and I pointed out why they were wrong, and why you shouldn't use those numbers. Linus
On Fri, 29 Aug 2025 09:42:03 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 09:33, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > I just realized that I'm using the rhashtable as an "does this hash exist". > > The question is still *why*? The reason is to keep from triggering the event that records the pathname for every look up. > > SO JUST USE THE NUMBERS, for chissake! Don't make them mean anything. > Don't try to think they mean something. > > The *reason* I htink hashing 'struct file *' is better than the > alternative is exactly that it *cannot* mean anything. It will get > re-used quite actively, even when nobody actually changes any of the > files. So you are forced to deal with this correctly, even though you > seem to be fighting dealing with it correctly tooth and nail. > > And at no point have you explained why you can't just treat it as > meaningless numbers. The patch that started this all did exactly that. > It just used the *wrong* numbers, and I pointed out why they were > wrong, and why you shouldn't use those numbers. I agree. The hash I showed last time was just using the pointers. The hash itself is meaningless and is useless by itself. The only thing the hash is doing is to be an identifier in the stack trace so that the path name and buildid don't need to be generated and saved every time. In my other email, I'm thinking of using the pid / vma->vm_start as a key to know if the pathname needs to be printed again or not. Although, perhaps if a task does a dlopen(), load some text and execute it, then a dlclose() and another dlopen() and loads text, that this could break the assumption that the vm_start is unique per file. Just to clarify, the goal of this exercise is to avoid the work of creating and generating the pathnames and buildids for every lookup / stacktrace. Now maybe hashing the pathname isn't as expensive as I think it may be. And just doing that could be "good enough". -- Steve
On Fri, 29 Aug 2025 at 09:57, Steven Rostedt <rostedt@goodmis.org> wrote: > > The reason is to keep from triggering the event that records the pathname > for every look up. BUT THAT WAS NEVER THE POINT. There is only a single 64-bit number. No lookup. No pointer following. No nothing. The whole point of hashing was to get an *opaque* thing very quickly. Not a pathname. No reference counting. No verifying whether you have seen it before. Literally just something that you can match up in the trace file much much later. (And, honestly, the likely thing is that you never match it up at all - you can delay the "match it up" until a human actually looks at a trace, which is presumably going to be a "one in a million" case). Linus
On Fri, 29 Aug 2025 10:02:40 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > (And, honestly, the likely thing is that you never match it up at all > - you can delay the "match it up" until a human actually looks at a > trace, which is presumably going to be a "one in a million" case). Note the use case for this is for tooling that will be using these traces for either flame graphs or for seeing where trouble areas are. That is, if someone is enabling these stack traces, they most definitely will be looked at. Maybe not directly by a human, but the tooling will and it will need the mapping information. -- Steve
On Fri, 29 Aug 2025 at 09:42, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > Just use the hash. Don't do anything to it. Don't mess with it. In fact, at actual stack trace time, don't even do the hashing. Just save the raw pointer value (but as a *value*, not as a pointer: we absolutely do *not* want people to think that the random value can be used as a 'struct file' *: it needs to be a plain unsigned long, not some kernel pointer). Then the hashing can happen when you expose those entries to user space (in the "print" stage). At that point you can do that hash = siphash_1u64(value, secret); thing. That will likely help I$ and D$ too, since you won't be accessing the secret hashing data randomly, but do it only at trace output time (presumably in a fairly tight loop at that point). Linus
On Fri, 29 Aug 2025 09:50:12 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Fri, 29 Aug 2025 at 09:42, Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > > > Just use the hash. Don't do anything to it. Don't mess with it. > > In fact, at actual stack trace time, don't even do the hashing. Just > save the raw pointer value (but as a *value*, not as a pointer: we > absolutely do *not* want people to think that the random value can be > used as a 'struct file' *: it needs to be a plain unsigned long, not > some kernel pointer). > > Then the hashing can happen when you expose those entries to user > space (in the "print" stage). At that point you can do that > > hash = siphash_1u64(value, secret); > > thing. > > That will likely help I$ and D$ too, since you won't be accessing the > secret hashing data randomly, but do it only at trace output time > (presumably in a fairly tight loop at that point). Note, the ring buffer can be mapped to user space. So anything written into the buffer is already exposed. The "at trace output time" is done by user space, not the kernel (except when using "trace" and "trace_pipe" files). -- Steve
On Fri, 29 Aug 2025 at 10:02, Steven Rostedt <rostedt@goodmis.org> wrote: > > Note, the ring buffer can be mapped to user space. So anything written into > the buffer is already exposed. Oh, good point. Yeah, that means that you have to do the hashing immediately. Too bad. Because while 'vma->vm_file' is basically free (since you have to access the vma for other reasons anyway), a good hash isn't. siphash is good and fast for being what it is, but it's not completely free. It's something like 50 shift/xor pairs, and it obviously needs to also access that secret hash value that is likely behind a cache miss.. Still, I suspect it's the best we've got. (If hashing is noticeable, it *might* be worth it to use 'siphash_1u32()' and only hash 32 bits of the pointers. That makes the hashing slightly cheaper, and since the low bits of the pointer will be zero anyway due to alignment, and the high bits don't have a lot of information in them either, it doesn't actually remove much information. You might get collisions if the two pointers are exactly 32 GB apart or whatever, but that sounds really really unlucky) Linus
On August 29, 2025 2:13:33 PM GMT-03:00, Linus Torvalds <torvalds@linux-foundation.org> wrote: >On Fri, 29 Aug 2025 at 10:02, Steven Rostedt <rostedt@goodmis.org> wrote: >> >> Note, the ring buffer can be mapped to user space. So anything written into >> the buffer is already exposed. > >Oh, good point. Yeah, that means that you have to do the hashing >immediately. Too bad. Because while 'vma->vm_file' is basically free >(since you have to access the vma for other reasons anyway), a good >hash isn't. Can't we continue with that idea by using some VMA sequential number that doesn't expose anything critical to user space but allows us to match stack entries to mmap records? Deferring the heavy lifting to when needed is great. - Arnaldo > >siphash is good and fast for being what it is, but it's not completely >free. It's something like 50 shift/xor pairs, and it obviously needs >to also access that secret hash value that is likely behind a cache >miss.. > >Still, I suspect it's the best we've got. > >(If hashing is noticeable, it *might* be worth it to use >'siphash_1u32()' and only hash 32 bits of the pointers. That makes the >hashing slightly cheaper, and since the low bits of the pointer will >be zero anyway due to alignment, and the high bits don't have a lot of >information in them either, it doesn't actually remove much >information. You might get collissions if the two pointers are exactly >32 GB apart or whatever, but that sounds really really unlucky) > > Linus - Arnaldo
On Fri, 29 Aug 2025 at 10:58, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > > Can't we continue with that idea by using some VMA sequential number that don't expose anything critical to user space but allows us to match stack entries to mmap records? No such record exists. I guess we could add an atomic ID to do_mmap() and have a 64-bit value that would be unique and would follow vma splitting and movement around. But it would actually be worse than just using the 'struct file' pointer, and the only advantage would be avoiding the hashing. And the disadvantages would be many. In particular, it would be much better if it was per-file, but we are *definitely* not adding some globally unique value to each file, because we already have seen performance issues with open/close loads just from the atomic sequence counters we used to give to anonymous inodes. (I say "used to give" - see get_next_ino() for what we do now, with per-cpu counter sequences. We'd have to do similar tricks for some kind of 'file ID', and I really don't want to do that with no reason). And if the only reason is "hashing takes a hundred cycles" when generating traces, that's just not a reason good enough to bloat core kernel data structures like 'struct file' and make core ops like open() more expensive. Linus
[ My last email of the night, as it's our anniversary, and I'm off to dinner now ;-) ] On Thu, 28 Aug 2025 15:10:52 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Thu, 28 Aug 2025 at 14:17, Steven Rostedt <rostedt@kernel.org> wrote: > > > > But that's unique per task, right? What I liked about the f_inode > > pointer, is that it appears to be shared between tasks. > > I actually think the local meaning of the file pointer is an advantage. > > It not only means that you see the difference in mappings of the same > file created with different open calls, it also means that when > different processes mmap the same executable, they don't see the same > hash. > > And because the file pointer doesn't have any long-term meaning, it > also means that you also can't make the mistake of thinking the hash > has a long lifetime. With an inode pointer hash, you could easily have > software bugs that end up not realizing that it's a temporary hash, > and that the same inode *will* get two different hashes if the inode > has been flushed from memory and then loaded anew due to memory > pressure. This is a reasonable argument. But it is still nice to have the same value for all tasks. This is for a "file_cache" that does get flushed regularly (when various changes happen to the tracefs system). It's only purpose is to map the user space stack trace hash value to a path name (and build-id). But yeah, I do not want another file to get flagged with the same hash. > > > I only want to add a new hash and print the path for a new file. If > > several tasks are using the same file (which they are with the > > libraries), then having the hash be the same between tasks would be > > more efficient. > > Why? See above why I think it's a mistake to think those hashes have > lifetimes. They don't. Two different inodes can have the same hash due > to lifetime issues, and the same inode can get two different hashes at > different times for the same reason. 
> > So you *need* to tie these things to the only lifetime that matters: > the open/close pair (and the mmap - and the stack traces - will be > part of that lifetime). > > I literally think that you are not thinking about this right if you > think you can re-use the hash. I'm just worried about this causing slow downs, especially if I also track the buildid. I did a quick update to the code to first use the f_inode and get the build_id, and it gives: trace-cmd-1012 [003] ...1. 35.247318: inode_cache: hash=0xcb214087 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1012 [003] ...1. 35.247333: inode_cache: hash=0x2565194a path=/usr/local/bin/trace-cmd build_id={0x3f399e26,0xf9eb2d4d,0x475fa369,0xf5bb7eeb,0x6244ae85} trace-cmd-1012 [003] ...1. 35.247419: inode_cache: hash=0x22dca920 path=/usr/local/lib64/libtracefs.so.1.8.2 build_id={0x6b040bdb,0x961f23d6,0xc1e1027e,0x7067c348,0xd069fa67} trace-cmd-1012 [003] ...1. 35.247455: inode_cache: hash=0xe87b6ea5 path=/usr/local/lib64/libtraceevent.so.1.8.4 build_id={0x8946b4eb,0xe3bf4ec5,0x11fd7d86,0xcd3105e2,0xe44a8d4d} trace-cmd-1012 [003] ...1. 35.247488: inode_cache: hash=0xafc34117 path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 build_id={0x379dc873,0x32bbdbc4,0x91eeb6cf,0xba549730,0xe2b96c55} bash-1003 [001] ...1. 35.248508: inode_cache: hash=0xcf9bd2d6 path=/usr/bin/bash build_id={0xd94aa36d,0x8e1f19c7,0xa4a69446,0x7338f602,0x20d66357} NetworkManager-581 [004] ...1. 35.703993: inode_cache: hash=0xea1c3e22 path=/usr/sbin/NetworkManager build_id={0x278c6dbb,0x4a1cdde6,0xa1a30a2c,0xbc417464,0x9dfaa28e} bash-1003 [001] ...1. 35.904817: inode_cache: hash=0x133252fa path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 build_id={0xff2193a5,0xb2ece2f1,0x1bcbd242,0xca302a0b,0xc155fd26} bash-1013 [004] ...1. 
37.716435: inode_cache: hash=0x53ae379b path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 build_id={0x4ed9e462,0xb302cd84,0x3ccf0104,0xbd80ac72,0x91c7fd44} bash-1013 [004] ...1. 37.722923: inode_cache: hash=0xa55a259e path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 build_id={0xc2d9e5b6,0xb211e958,0xdef878e4,0xe4022df,0x9552253} Now I changed it to be the file pointer, and it does give a bit more (see the duplicates): sshd-session-1004 [007] ...1. 98.940058: inode_cache: hash=0x41a6191a path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1016 [006] ...1. 98.940089: inode_cache: hash=0xcc38a542 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1016 [006] ...1. 98.940109: inode_cache: hash=0xa89cdd4b path=/usr/local/bin/trace-cmd build_id={0x3f399e26,0xf9eb2d4d,0x475fa369,0xf5bb7eeb,0x6244ae85} trace-cmd-1016 [006] ...1. 98.940410: inode_cache: hash=0xb3c570ca path=/usr/local/lib64/libtracefs.so.1.8.2 build_id={0x6b040bdb,0x961f23d6,0xc1e1027e,0x7067c348,0xd069fa67} trace-cmd-1016 [006] ...1. 98.940460: inode_cache: hash=0x4da4af85 path=/usr/local/lib64/libtraceevent.so.1.8.4 build_id={0x8946b4eb,0xe3bf4ec5,0x11fd7d86,0xcd3105e2,0xe44a8d4d} trace-cmd-1016 [006] ...1. 98.940513: inode_cache: hash=0xce16bd9d path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 build_id={0x379dc873,0x32bbdbc4,0x91eeb6cf,0xba549730,0xe2b96c55} bash-1007 [004] ...1. 98.941772: inode_cache: hash=0x772df671 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} bash-1007 [004] ...1. 98.941911: inode_cache: hash=0xdb764962 path=/usr/bin/bash build_id={0xd94aa36d,0x8e1f19c7,0xa4a69446,0x7338f602,0x20d66357} bash-1007 [004] ...1. 100.080299: inode_cache: hash=0xef3bf212 path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 build_id={0xff2193a5,0xb2ece2f1,0x1bcbd242,0xca302a0b,0xc155fd26} gmain-602 [003] ...1. 
100.477235: inode_cache: hash=0xc9205658 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1017 [005] ...1. 101.412116: inode_cache: hash=0x5a77751e path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 build_id={0x4ed9e462,0xb302cd84,0x3ccf0104,0xbd80ac72,0x91c7fd44} trace-cmd-1017 [005] ...1. 101.417004: inode_cache: hash=0xf2e95689 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1017 [005] ...1. 101.418528: inode_cache: hash=0x5f35d3ca path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 build_id={0x379dc873,0x32bbdbc4,0x91eeb6cf,0xba549730,0xe2b96c55} trace-cmd-1017 [005] ...1. 101.418572: inode_cache: hash=0x57feda78 path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 build_id={0xc2d9e5b6,0xb211e958,0xdef878e4,0xe4022df,0x9552253} trace-cmd-1017 [005] ...1. 101.418620: inode_cache: hash=0x22ad5d84 path=/usr/local/lib64/libtraceevent.so.1.8.4 build_id={0x8946b4eb,0xe3bf4ec5,0x11fd7d86,0xcd3105e2,0xe44a8d4d} trace-cmd-1017 [005] ...1. 101.418666: inode_cache: hash=0x11c240a6 path=/usr/local/lib64/libtracefs.so.1.8.2 build_id={0x6b040bdb,0x961f23d6,0xc1e1027e,0x7067c348,0xd069fa67} trace-cmd-1017 [005] ...1. 101.418714: inode_cache: hash=0xf4e46cf path=/usr/local/bin/trace-cmd build_id={0x3f399e26,0xf9eb2d4d,0x475fa369,0xf5bb7eeb,0x6244ae85} wpa_supplicant-583 [000] ...1. 102.521195: inode_cache: hash=0xd20a587b path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} trace-cmd-1018 [005] ...1. 102.847910: inode_cache: hash=0xee16ee8e path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} sshd-session-1004 [000] ...1. 102.853561: inode_cache: hash=0x3404c7ea path=/usr/lib/openssh/sshd-session build_id={0x3b119855,0x5b15323e,0xe1ec337a,0xbd49f66e,0x78bddd0f} systemd-udevd-323 [007] ...1. 
125.800839: inode_cache: hash=0x760273d5 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} systemd-journal-294 [000] ...1. 125.800932: inode_cache: hash=0x77f34056 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} systemd-udevd-323 [007] ...1. 125.801135: inode_cache: hash=0xe70bd063 path=/usr/lib/x86_64-linux-gnu/systemd/libsystemd-shared-257.so build_id={0x81d9bace,0x59f9953f,0x439928d7,0xe849d513,0xf2103286} systemd-1 [006] ...1. 125.801781: inode_cache: hash=0x42292844 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} systemd-1 [006] ...1. 125.802811: inode_cache: hash=0x2cac8b3b path=/usr/lib/x86_64-linux-gnu/systemd/libsystemd-core-257.so build_id={0x580a80c5,0x931714d2,0xec54d3be,0xd5400bc0,0x6f2530ba} systemd-1 [006] ...1. 125.803740: inode_cache: hash=0xb17acaa6 path=/usr/lib/x86_64-linux-gnu/systemd/libsystemd-shared-257.so build_id={0x81d9bace,0x59f9953f,0x439928d7,0xe849d513,0xf2103286} cron-541 [006] ...1. 138.192640: inode_cache: hash=0x9285db61 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} NetworkManager-581 [005] ...1. 144.716224: inode_cache: hash=0xf3c5bbc1 path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} NetworkManager-581 [005] ...1. 144.716392: inode_cache: hash=0x381883bb path=/usr/sbin/NetworkManager build_id={0x278c6dbb,0x4a1cdde6,0xa1a30a2c,0xbc417464,0x9dfaa28e} NetworkManager-581 [005] ...1. 146.385151: inode_cache: hash=0x43451e15 path=/usr/lib/x86_64-linux-gnu/libglib-2.0.so.0.8400.0 build_id={0x9a7d3e29,0x5d8ed8f,0xe399da0,0xb5d373da,0x3ca1049b} chronyd-663 [001] ...1. 157.080405: inode_cache: hash=0xa0db647a path=/usr/lib/x86_64-linux-gnu/libc.so.6 build_id={0x10bddb6d,0xf5234181,0xc2f72e26,0x1aa4f797,0x6aa19eda} chronyd-663 [001] ...1. 
158.152790: inode_cache: hash=0x1c471c4c path=/usr/sbin/chronyd build_id={0xf9588e62,0x3a8e6223,0x619fcb4f,0x12562bb,0x2ea104fb} But maybe it's not enough to be an issue. But this will become more predominant when sframes is built throughout. I only have a few applications having sframes enabled so not every task is getting a full stack trace, and hence, not all the files being touched are being displayed. Just to clarify my concern. I want the stack traces to be quick and small. I believe a 32 bit hash may be enough. And then have a side event that gets updated when new files appear that can display much more information. This side event may be slow which is why I don't want it to occur often. But I do want it to occur for all new files. -- Steve
On August 28, 2025 5:17:18 PM GMT-03:00, Steven Rostedt <rostedt@kernel.org> wrote: >On Thu, 28 Aug 2025 12:18:39 -0700 >Linus Torvalds <torvalds@linux-foundation.org> wrote: > >> On Thu, 28 Aug 2025 at 11:58, Arnaldo Carvalho de Melo >> <arnaldo.melo@gmail.com> wrote: >> > > >> > >Give the damn thing an actual filename or something *useful*, not a >> > >number that user space can't even necessarily match up to anything. >> > >> > A build ID? >> >> I think that's a better thing than the disgusting inode number, yes. > >I don't care what it is. I picked inode/device just because it was the >only thing I saw available. I'm not sure build ID is appropriate either. > >> >> That said, I think they are problematic too, in that I don't think >> they are universally available, so if you want to trace some >> executable without build ids - and there are good reasons to do that - >> you might hate being limited that way. >> >> So I think you'd be much better off with just actual pathnames. > >As you mentioned below, the reason I avoided path names is that they >take up too much of the ring buffer, and would be duplicated all over >the place. I've run this for a while, and it only picked up a couple of >hundred paths while the trace had several thousand stack traces. > >> >> Are there no trace events for "mmap this path"? Create a good u64 hash >> from the contents of a 'struct path' (which is just two pointers: the >> dentry and the mnt) when mmap'ing the file, and then you can just >> associate the stack trace entry with that hash. > >I would love to have a hash to use. The next patch does the mapping of >the inode numbers to their path name. It can The path name is a nice to have detail, but a content based hash is what we want, no? Tracing/profiling has to be about contents of files later used for analysis, and filenames provide no guarantee about that. - Arnaldo easily be switched over to >do the same with a hash number. 
> >> >> That should be simple and straightforward, and hashing two pointers >> should be simple and straightforward. > >Would a hash of these pointers have any collisions? That would be bad. > >Hmm, I just tried using the pointer to vma->vm_file->f_inode, and that >gives me a unique number. Then I just need to map that back to the path name: > > trace-cmd-1016 [002] ...1. 34.675646: inode_cache: inode=ffff8881007ed428 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libc.so.6 > trace-cmd-1016 [002] ...1. 34.675893: inode_cache: inode=ffff88811970e648 dev=[254:3] path=/usr/local/lib64/libtracefs.so.1.8.2 > trace-cmd-1016 [002] ...1. 34.675933: inode_cache: inode=ffff88811970b8f8 dev=[254:3] path=/usr/local/lib64/libtraceevent.so.1.8.4 > trace-cmd-1016 [002] ...1. 34.675981: inode_cache: inode=ffff888110b78ba8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 > bash-1007 [003] ...1. 34.677316: inode_cache: inode=ffff888103f05d38 dev=[254:3] path=/usr/bin/bash > bash-1007 [003] ...1. 35.432951: inode_cache: inode=ffff888116be94b8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 > bash-1018 [005] ...1. 36.104543: inode_cache: inode=ffff8881007e9dc8 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 > bash-1018 [005] ...1. 36.110407: inode_cache: inode=ffff888110b78298 dev=[254:3] path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 > bash-1018 [005] ...1. 36.110536: inode_cache: inode=ffff888103d09dc8 dev=[254:3] path=/usr/local/bin/trace-cmd > >I just swapped out the inode with the above (unsigned long)vma->vm_file->f_inode, >and it appears to be unique. > >Thus, I could use that as the "hash" value and then the above could be turned into: > > trace-cmd-1016 [002] ...1. 34.675646: inode_cache: hash=ffff8881007ed428 path=/usr/lib/x86_64-linux-gnu/libc.so.6 > trace-cmd-1016 [002] ...1. 34.675893: inode_cache: hash=ffff88811970e648 path=/usr/local/lib64/libtracefs.so.1.8.2 > trace-cmd-1016 [002] ...1. 
34.675933: inode_cache: hash=ffff88811970b8f8 path=/usr/local/lib64/libtraceevent.so.1.8.4 > trace-cmd-1016 [002] ...1. 34.675981: inode_cache: hash=ffff888110b78ba8 path=/usr/lib/x86_64-linux-gnu/libzstd.so.1.5.7 > bash-1007 [003] ...1. 34.677316: inode_cache: hash=ffff888103f05d38 path=/usr/bin/bash > bash-1007 [003] ...1. 35.432951: inode_cache: hash=ffff888116be94b8 path=/usr/lib/x86_64-linux-gnu/libtinfo.so.6.5 > bash-1018 [005] ...1. 36.104543: inode_cache: hash=ffff8881007e9dc8 path=/usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 > bash-1018 [005] ...1. 36.110407: inode_cache: hash=ffff888110b78298 path=/usr/lib/x86_64-linux-gnu/libz.so.1.3.1 > bash-1018 [005] ...1. 36.110536: inode_cache: hash=ffff888103d09dc8 path=/usr/local/bin/trace-cmd > >This would mean the readers of the userstacktrace_delay need to also >have this event enabled to do the mappings. But that shouldn't be an >issue. > >-- Steve > - Arnaldo
On Thu, 28 Aug 2025 17:27:37 -0300 Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > >I would love to have a hash to use. The next patch does the mapping > >of the inode numbers to their path name. It can > > The path name is a nice to have detail, but a content based hash is > what we want, no? > > Tracing/profiling has to be about contents of files later used for > analysis, and filenames provide no guarantee about that. I could add the build id to the inode_cache as well (which I'll rename to file_cache). Thus, the user stack trace will just have the offset and a hash value that will match the output of the file_cache event which will have the path name and a build id (if one exists). Would that work? -- Steve
On August 28, 2025 5:51:39 PM GMT-03:00, Steven Rostedt <rostedt@kernel.org> wrote: >On Thu, 28 Aug 2025 17:27:37 -0300 >Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > >> >I would love to have a hash to use. The next patch does the mapping >> >of the inode numbers to their path name. It can >> >> The path name is a nice to have detail, but a content based hash is >> what we want, no? >> >> Tracing/profiling has to be about contents of files later used for >> analysis, and filenames provide no guarantee about that. >I could add the build id to the inode_cache as well (which I'll rename >to file_cache). > >Thus, the user stack trace will just have the offset and a hash value >that will be match the output of the file_cache event which will have >the path name and a build id (if one exists). > >Would that work? Probably. This "if it is available" question is valid, but since 2016 it's more of a "did developers disable it explicitly?" If my "googling" isn't wrong, GNU LD defaults to generating a build ID in ELF images since 2011 and clang's companion since 2016. So making it even more available than what the BPF guys did long ago and perf piggybacked on at some point, by having it cached, on request?, in some 20 bytes alignment hole in task_struct that would be only used when profiling/tracing may be amenable. - Arnaldo
Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> writes: > On August 28, 2025 5:51:39 PM GMT-03:00, Steven Rostedt <rostedt@kernel.org> wrote: >>On Thu, 28 Aug 2025 17:27:37 -0300 >>Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: >> >>> >I would love to have a hash to use. The next patch does the mapping >>> >of the inode numbers to their path name. It can >>> >>> The path name is a nice to have detail, but a content based hash is >>> what we want, no? >>> >>> Tracing/profiling has to be about contents of files later used for >>> analysis, and filenames provide no guarantee about that. >> >>I could add the build id to the inode_cache as well (which I'll rename >>to file_cache). >> >>Thus, the user stack trace will just have the offset and a hash value >>that will be match the output of the file_cache event which will have >>the path name and a build id (if one exists). >> >>Would that work? > > Probably. > > This "if it is available" question is valid, but since 2016 it's is more of a "did developers disabled it explicitly?" > > If my "googling" isn't wrong, GNU LD defaults to generating a build ID in ELF images since 2011 and clang's companion since 2016. GNU ld doesn't ever default to generating build IDs, and I don't *think* LLVM does either (either in Clang, or LLD). GCC, on the other hand, has a configure arg to control this, but it's default-off. Clang generally prefers to have defaults like this done via user/sysadmin specified configuration files rather than adding build-time configure flags. Now, is it a reasonable ask to say "we require build IDs for this feature"? Yeah, it probably is, but it's not default-on right now, and indeed we in Gentoo aren't using them yet (but I'm working on enabling them). 
> > So making it even more available than what the BPF guys did long ago > and perf piggybacked on at some point, by having it cached, on > request?, in some 20 bytes alignment hole in task_struct that would be > only used when profiling/tracing may be amenable. thanks, sam
On Thu, 28 Aug 2025 18:00:22 -0300 Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > >Thus, the user stack trace will just have the offset and a hash value > >that will be match the output of the file_cache event which will have > >the path name and a build id (if one exists). > > > >Would that work? > > Probably. > > This "if it is available" question is valid, but since 2016 it's is > more of a "did developers disabled it explicitly?" The "if one exists" comment is that it's not a requirement. If none exists, it would just add a zero. > > If my "googling" isn't wrong, GNU LD defaults to generating a build > ID in ELF images since 2011 and clang's companion since 2016. > > So making it even more available than what the BPF guys did long ago > and perf piggybacked on at some point, by having it cached, on > request?, in some 20 bytes alignment hole in task_struct that would > be only used when profiling/tracing may be amenable. Would perf be interested in this hash file lookup? I know perf is reliant on user space more than ftrace is, and has a lot of work happening in user space while getting stack traces. With ftrace, there's no real user space requirement, thus a lot of the work needs to be done in the kernel. If we go with a hash to file, it's somewhat useless by itself without a way to map the hash to file/buildid. I originally started making this hash->file a file in tracefs. But then I needed to figure out how to manage the allocations. Do I add a "size" for that file and start dropping mappings when it reaches that limit. Then I may need to add a LRU algorithm to do so. I found simply having an event that wrote out the mappings was so much easier to implement. But the file_cache code could be used by perf, where perf does the same and just monitors the file_cache event. I could make the API more global than just the kernel/trace directory. -- Steve
On Thu, 28 Aug 2025 at 13:27, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > > The path name is a nice to have detail, but a content based hash is what we want, no? No. We want something easy and quick to compute because this is looked up at stacktrace time. That's the primary issue. You can do the mapping to some build-id - if it even exists - later. Linus
On Thu, Aug 28, 2025 at 12:18:39PM -0700, Linus Torvalds wrote: > On Thu, 28 Aug 2025 at 11:58, Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com> wrote: > > >Give the damn thing an actual filename or something *useful*, not a > > >number that user space can't even necessarily match up to anything. > > A build ID? > I think that's a better thing than the disgusting inode number, yes. > That said, I think they are problematic too, in that I don't think > they are universally available, so if you want to trace some > executable without build ids - and there are good reasons to do that - > you might hate being limited that way. Right, but these days gdb (and other traditional tools) supports it and downloads it (perf should do it with a one-time sticky question too, does it already in some cases, unconditionally, that should be fixed as well), most distros have it: ⬢ [acme@toolbx perf-tools-next]$ file /bin/bash /bin/bash: ELF 64-bit LSB pie executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=707a1c670cd72f8e55ffedfbe94ea98901b7ce3a, for GNU/Linux 3.2.0, stripped ⬢ [acme@toolbx perf-tools-next]$ We have debuginfod-servers that brings ELF images with debug keyed by that build id and finally build-ids come together with pathnames, so if one is null, fallback to the other. Default in fedora: ⬢ [acme@toolbx perf-tools-next]$ echo $DEBUGINFOD_ $DEBUGINFOD_IMA_CERT_PATH $DEBUGINFOD_URLS ⬢ [acme@toolbx perf-tools-next]$ echo $DEBUGINFOD_ $DEBUGINFOD_IMA_CERT_PATH $DEBUGINFOD_URLS ⬢ [acme@toolbx perf-tools-next]$ echo $DEBUGINFOD_IMA_CERT_PATH /etc/keys/ima: ⬢ [acme@toolbx perf-tools-next]$ echo $DEBUGINFOD_URLS https://debuginfod.fedoraproject.org/ ⬢ [acme@toolbx perf-tools-next]$ I wasn't aware of that IMA stuff. 
So even without the mandate and with sometimes not being able to get that build-id, most of the time they are there and deterministically allows tooling to fetch it in most cases, I guess that is as far as we can pragmatically get. - Arnaldo > So I think you'd be much better off with just actual pathnames. > > Are there no trace events for "mmap this path"? Create a good u64 hash > from the contents of a 'struct path' (which is just two pointers: the > dentry and the mnt) when mmap'ing the file, and then you can just > associate the stack trace entry with that hash. > > That should be simple and straightforward, and hashing two pointers > should be simple and straightforward. > > And then matching that hash against the mmap event where the actual > path was saved off gives you an actual *pathname*. Which is *so* much > better than those horrific inode numbers. > > And yes, yes, obviously filenames can go away and aren't some kind of > long-term stable thing. But inode numbers can be re-used too, so > that's no different. > > With the "create a hash of 'struct path' contents" you basically have > an ID that can be associated with whatever the file name was at the > time it was mmap'ed into the thing you are tracing, which is I think > what you really want anyway. > > Now, what would be even simpler is to not create a hash at all, but > simply just create the whole pathname when the stack trace entry is > created. But it would probably waste too much space, since you'd > probably want to have at least 32 bytes (as opposed to just 64 bits) > for a (truncated) pathname. > > And it would be more expensive than just hashing the dentry/mnt > pointers, although '%pD' isn't actually *that* expensive. But probably > expensive enough to not really be acceptable. I'm just throwing it out > as a stupid idea that at least generates much more usable output than > the inode numbers do. > > Linus
On Thu, 28 Aug 2025 at 13:04, Arnaldo Carvalho de Melo <acme@kernel.org> wrote: > > > That said, I think they are problematic too, in that I don't think > > they are universally available, so if you want to trace some > > executable without build ids - and there are good reasons to do that - > > you might hate being limited that way. > > Right, but these days gdb (and other traditional tools) supports it and > downloads it (perf should do it with a one-time sticky question too, > does it already in some cases, unconditionally, that should be fixed as > well), most distros have it: So I'm literally thinking one very valid case is debugging some third-party binary with tracing. End result: the whole "most distros have it" is just not relevant to that situation. Linus
On 2025-08-28 14:58, Arnaldo Carvalho de Melo wrote: > > > On August 28, 2025 3:39:35 PM GMT-03:00, Linus Torvalds <torvalds@linux-foundation.org> wrote: >> On Thu, 28 Aug 2025 at 11:05, Steven Rostedt <rostedt@kernel.org> wrote: >>> >>> The deferred user space stacktrace event already does a lookup of the vma >>> for each address in the trace to get the file offset for those addresses, >>> it can also report the file itself. >> >> That sounds like a good idea.. >> >> But the implementation absolutely sucks: >> >>> Add two more arrays to the user space stacktrace event. One for the inode >>> number, and the other to store the device major:minor number. Now the >>> output looks like this: >> >> WTF? Why are you back in the 1960's? What's next? The index into the >> paper card deck? >> >> Stop using inode numbers and device numbers already. It's the 21st >> century. No, cars still don't fly, but dammit, inode numbers were a >> great idea back in the days, but they are not acceptable any more. >> >> They *particularly* aren't acceptable when you apparently think that >> they are 'unsigned long'. Yes, that's the internal representation we >> use for inode indexing, but for example on nfs the inode is actually >> bigger. It's exposed to user space as a u64 through >> >> stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); >> >> so the inode that user space sees in 'struct stat' (a) doesn't >> actually match inode->i_ino, and (b) isn't even the full file ID that >> NFS actually uses. >> >> Let's not let that 60's thinking be any part of a new interface. >> >> Give the damn thing an actual filename or something *useful*, not a >> number that user space can't even necessarily match up to anything. >> > > A build ID? 
> > PERF_RECORD_MMAP went thru this, filename -> inode -> Content based hash FWIW, we record: - executable or shared library path name, - build id (if available), - debug link (if available), in LTTng-UST when we dump the loaded executable and libraries from userspace. Thanks, Mathieu -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
© 2016 - 2025 Red Hat, Inc.