[PATCH] fs: inline step_into() and walk_component()

Mateusz Guzik posted 1 patch 1 week, 5 days ago
fs/namei.c | 67 ++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 50 insertions(+), 17 deletions(-)
[PATCH] fs: inline step_into() and walk_component()
Posted by Mateusz Guzik 1 week, 5 days ago
The primary consumer is link_path_walk(), calling walk_component() every
time which in turn calls step_into().

Inlining these saves the overhead of 2 function calls per path component,
along with allowing the compiler to do a better job optimizing them in place.

step_into() had absolutely atrocious assembly to facilitate the
slowpath. In order to lessen the burden at the callsite all the hard
work is moved into step_into_slowpath(). This also elides some of the
branches as for example LOOKUP_RCU is only checked once.

The inline-able step_into() variant deals with the common case of
traversing a cached, non-mounted-on directory while in RCU mode.

Since symlink handling is already denoted as unlikely(), I took the
opportunity to also shorten step_into_slowpath() by moving parts of it
into pick_link() which further shortens assembly.

Benchmarked as follows on Sapphire Rapids:
1. the "before" was a kernel with not-yet-merged optimizations (notably
   elision of calls to security_inode_permission() and marking ext4
   inodes as not having acls)
2. "after" is the same + this patch
3. benchmark consists of issuing 205 calls to access(2) in a loop with
   pathnames lifted out of gcc and the linker building real code, most
   of which have several path components and 118 of which fail with
   -ENOENT. Some of those do symlink traversal.

In terms of ops/s:
before:	21619
after:	22536 (+4%)

profile before:
  20.25%  [kernel]                  [k] __d_lookup_rcu
  10.54%  [kernel]                  [k] link_path_walk
  10.22%  [kernel]                  [k] entry_SYSCALL_64
   6.50%  libc.so.6                 [.] __GI___access
   6.35%  [kernel]                  [k] strncpy_from_user
   4.87%  [kernel]                  [k] step_into
   3.68%  [kernel]                  [k] kmem_cache_alloc_noprof
   2.88%  [kernel]                  [k] walk_component
   2.86%  [kernel]                  [k] kmem_cache_free
   2.14%  [kernel]                  [k] set_root
   2.08%  [kernel]                  [k] lookup_fast

after:
  23.38%  [kernel]                  [k] __d_lookup_rcu
  11.27%  [kernel]                  [k] entry_SYSCALL_64
  10.89%  [kernel]                  [k] link_path_walk
   7.00%  libc.so.6                 [.] __GI___access
   6.88%  [kernel]                  [k] strncpy_from_user
   3.50%  [kernel]                  [k] kmem_cache_alloc_noprof
   2.01%  [kernel]                  [k] kmem_cache_free
   2.00%  [kernel]                  [k] set_root
   1.99%  [kernel]                  [k] lookup_fast
   1.81%  [kernel]                  [k] do_syscall_64
   1.69%  [kernel]                  [k] entry_SYSCALL_64_safe_stack

While walk_component() and step_into() of course disappear from the
profile, link_path_walk() barely gains any overhead despite the
inlining, thanks to the added fast path, all while completing more
walks per second.

I don't know why overhead grew a lot on __d_lookup_rcu().

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

perhaps this could be 2-3 patches instead to do things incrementally
the other patches not listed in the commit message are:
1. mntput_not_expire slowpath (Christian took it, not present in fs-next yet)
2. nd->depth predicts.
3. lookup_slow noinline

all of those sent

you may notice step_into is quite high on the profile:
   4.87%  [kernel]                  [k] step_into

here is the routine prior to the patch:
    call   ffffffff81374630 <__fentry__>
    push   %rbp
    mov    %rsp,%rbp
    push   %r15
    push   %r14
    mov    %esi,%r14d
    push   %r13
    push   %r12
    push   %rbx
    mov    %rdi,%rbx
    sub    $0x28,%rsp
    mov    (%rdi),%rax
    mov    0x38(%rdi),%r8d
    mov    %gs:0x2e10acd(%rip),%r12        # ffffffff84551008 <__stack_chk_guard>

    mov    %r12,0x20(%rsp)
    mov    %rdx,%r12
    mov    %rdx,0x18(%rsp)
    mov    %rax,0x10(%rsp)

This is setup before it even gets to do anything, which of course has
to be undone later. Also note the stackguard check. I'm not pasting
everything; you can disasm it yourself to check the entire monstrosity.

In contrast this is entirety of step_into() fast path with the patch + noinline:
         call   ffffffff81374630 <__fentry__>
         testb  $0x1,0x39(%rdi)
         je     ffffffff81740cbe <step_into+0x4e>
         mov    (%rdx),%eax
         test   $0x38000,%eax
         jne    ffffffff81740cbe <step_into+0x4e>
         and    $0x380000,%eax
         mov    0x30(%rdx),%rcx
         cmp    $0x300000,%eax
         je     ffffffff81740cbe <step_into+0x4e>
         mov    (%rdi),%r8
         mov    0x44(%rdi),%esi
         mov    0x4(%rdx),%eax
         cmp    %eax,%esi
         jne    ffffffff81740cc3 <step_into+0x53>
         test   %rcx,%rcx
         je     ffffffff81740ccf <step_into+0x5f>
         mov    0x44(%rdi),%eax
         mov    %r8,(%rdi)
         mov    %rdx,0x8(%rdi)
         mov    %eax,0x40(%rdi)
         xor    %eax,%eax
         mov    %rcx,0x30(%rdi)
         jmp    ffffffff8230b1f0 <__pi___x86_return_thunk>

This possibly can be shortened but I have not tried yet.

 fs/namei.c | 67 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 17 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1d1f864ad6ad..e00b9ce21536 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1668,17 +1668,17 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
 	bool jumped;
 	int ret;
 
-	path->mnt = nd->path.mnt;
-	path->dentry = dentry;
 	if (nd->flags & LOOKUP_RCU) {
 		unsigned int seq = nd->next_seq;
+		if (likely(!(dentry->d_flags & DCACHE_MANAGED_DENTRY)))
+			return 0;
 		if (likely(__follow_mount_rcu(nd, path)))
 			return 0;
 		// *path and nd->next_seq might've been clobbered
 		path->mnt = nd->path.mnt;
 		path->dentry = dentry;
 		nd->next_seq = seq;
-		if (!try_to_unlazy_next(nd, dentry))
+		if (unlikely(!try_to_unlazy_next(nd, dentry)))
 			return -ECHILD;
 	}
 	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
@@ -1941,12 +1941,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link)
 
 enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
 
-static const char *pick_link(struct nameidata *nd, struct path *link,
+static noinline const char *pick_link(struct nameidata *nd, struct path *link,
 		     struct inode *inode, int flags)
 {
 	struct saved *last;
 	const char *res;
-	int error = reserve_stack(nd, link);
+	int error;
+
+	if (nd->flags & LOOKUP_RCU) {
+		/* make sure that d_is_symlink above matches inode */
+		if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq))
+			return ERR_PTR(-ECHILD);
+	} else {
+		if (link->mnt == nd->path.mnt)
+			mntget(link->mnt);
+	}
+
+	error = reserve_stack(nd, link);
 
 	if (unlikely(error)) {
 		if (!(nd->flags & LOOKUP_RCU))
@@ -2021,14 +2032,17 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
  *
  * NOTE: dentry must be what nd->next_seq had been sampled from.
  */
-static const char *step_into(struct nameidata *nd, int flags,
+static noinline const char *step_into_slowpath(struct nameidata *nd, int flags,
 		     struct dentry *dentry)
 {
 	struct path path;
 	struct inode *inode;
-	int err = handle_mounts(nd, dentry, &path);
+	int err;
 
-	if (err < 0)
+	path.mnt = nd->path.mnt;
+	path.dentry = dentry;
+	err = handle_mounts(nd, dentry, &path);
+	if (unlikely(err < 0))
 		return ERR_PTR(err);
 	inode = path.dentry->d_inode;
 	if (likely(!d_is_symlink(path.dentry)) ||
@@ -2050,17 +2064,36 @@ static const char *step_into(struct nameidata *nd, int flags,
 		nd->seq = nd->next_seq;
 		return NULL;
 	}
-	if (nd->flags & LOOKUP_RCU) {
-		/* make sure that d_is_symlink above matches inode */
-		if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
-			return ERR_PTR(-ECHILD);
-	} else {
-		if (path.mnt == nd->path.mnt)
-			mntget(path.mnt);
-	}
 	return pick_link(nd, &path, inode, flags);
 }
 
+static __always_inline const char *step_into(struct nameidata *nd, int flags,
+		     struct dentry *dentry)
+{
+	struct path path;
+	struct inode *inode;
+
+	path.mnt = nd->path.mnt;
+	path.dentry = dentry;
+	if (!(nd->flags & LOOKUP_RCU))
+		goto slowpath;
+	if (unlikely(dentry->d_flags & DCACHE_MANAGED_DENTRY))
+		goto slowpath;
+	inode = path.dentry->d_inode;
+	if (unlikely(d_is_symlink(path.dentry)))
+		goto slowpath;
+	if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
+		return ERR_PTR(-ECHILD);
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOENT);
+	nd->path = path;
+	nd->inode = inode;
+	nd->seq = nd->next_seq;
+	return NULL;
+slowpath:
+	return step_into_slowpath(nd, flags, dentry);
+}
+
 static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
 {
 	struct dentry *parent, *old;
@@ -2171,7 +2204,7 @@ static const char *handle_dots(struct nameidata *nd, int type)
 	return NULL;
 }
 
-static const char *walk_component(struct nameidata *nd, int flags)
+static __always_inline const char *walk_component(struct nameidata *nd, int flags)
 {
 	struct dentry *dentry;
 	/*
-- 
2.48.1
Re: [PATCH] fs: inline step_into() and walk_component()
Posted by Al Viro 1 week, 5 days ago
On Wed, Nov 19, 2025 at 07:40:01PM +0100, Mateusz Guzik wrote:

> -static const char *pick_link(struct nameidata *nd, struct path *link,
> +static noinline const char *pick_link(struct nameidata *nd, struct path *link,
>  		     struct inode *inode, int flags)
>  {
>  	struct saved *last;
>  	const char *res;
> -	int error = reserve_stack(nd, link);
> +	int error;
> +
> +	if (nd->flags & LOOKUP_RCU) {
> +		/* make sure that d_is_symlink above matches inode */

Where would that "above" be?  Have some pity on the readers...

> +static __always_inline const char *step_into(struct nameidata *nd, int flags,
> +		     struct dentry *dentry)
> +{
> +	struct path path;
> +	struct inode *inode;
> +
> +	path.mnt = nd->path.mnt;
> +	path.dentry = dentry;
> +	if (!(nd->flags & LOOKUP_RCU))
> +		goto slowpath;
> +	if (unlikely(dentry->d_flags & DCACHE_MANAGED_DENTRY))
> +		goto slowpath;
> +	inode = path.dentry->d_inode;
> +	if (unlikely(d_is_symlink(path.dentry)))
> +		goto slowpath;
> +	if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
> +		return ERR_PTR(-ECHILD);
> +	if (unlikely(!inode))
> +		return ERR_PTR(-ENOENT);
> +	nd->path = path;
> +	nd->inode = inode;
> +	nd->seq = nd->next_seq;
> +	return NULL;
> +slowpath:
> +	return step_into_slowpath(nd, flags, dentry);
> +}

Is there any point keeping that struct path here?  Hell, what's wrong
with

static __always_inline const char *step_into(struct nameidata *nd, int flags,
		     struct dentry *dentry)
{
	if (likely((nd->flags & LOOKUP_RCU) &&
	    !(dentry->d_flags & DCACHE_MANAGED_DENTRY) &&
	    !d_is_symlink(dentry))) {
		// Simple case: no messing with refcounts and
		// neither a symlink nor mountpoint.
		struct inode *inode = dentry->d_inode;

		if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
		if (unlikely(!inode))
			return ERR_PTR(-ENOENT);
		nd->path.dentry = dentry;
		nd->inode = inode;
		nd->seq = nd->next_seq;
		return NULL;
	}
	return step_into_slowpath(nd, flags, dentry);
}

for that matter?
Re: [PATCH] fs: inline step_into() and walk_component()
Posted by Mateusz Guzik 1 week, 5 days ago
On Wed, Nov 19, 2025 at 8:42 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Wed, Nov 19, 2025 at 07:40:01PM +0100, Mateusz Guzik wrote:
>
> > -static const char *pick_link(struct nameidata *nd, struct path *link,
> > +static noinline const char *pick_link(struct nameidata *nd, struct path *link,
> >                    struct inode *inode, int flags)
> >  {
> >       struct saved *last;
> >       const char *res;
> > -     int error = reserve_stack(nd, link);
> > +     int error;
> > +
> > +     if (nd->flags & LOOKUP_RCU) {
> > +             /* make sure that d_is_symlink above matches inode */
>
> Where would that "above" be?  Have some pity on the readers...
>

this is just copy pasted from the original, i'll patch it

> > +static __always_inline const char *step_into(struct nameidata *nd, int flags,
> > +                  struct dentry *dentry)
> > +{
> > +     struct path path;
> > +     struct inode *inode;
> > +
> > +     path.mnt = nd->path.mnt;
> > +     path.dentry = dentry;
> > +     if (!(nd->flags & LOOKUP_RCU))
> > +             goto slowpath;
> > +     if (unlikely(dentry->d_flags & DCACHE_MANAGED_DENTRY))
> > +             goto slowpath;
> > +     inode = path.dentry->d_inode;
> > +     if (unlikely(d_is_symlink(path.dentry)))
> > +             goto slowpath;
> > +     if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
> > +             return ERR_PTR(-ECHILD);
> > +     if (unlikely(!inode))
> > +             return ERR_PTR(-ENOENT);
> > +     nd->path = path;
> > +     nd->inode = inode;
> > +     nd->seq = nd->next_seq;
> > +     return NULL;
> > +slowpath:
> > +     return step_into_slowpath(nd, flags, dentry);
> > +}
>
> Is there any point keeping that struct path here?  Hell, what's wrong
> with
>
> static __always_inline const char *step_into(struct nameidata *nd, int flags,
>                      struct dentry *dentry)
> {
>         if (likely((nd->flags & LOOKUP_RCU) &&
>             !(dentry->d_flags & DCACHE_MANAGED_DENTRY) &&
>             !d_is_symlink(dentry))) {
>                 // Simple case: no messing with refcounts and
>                 // neither a symlink nor mountpoint.
>                 struct inode *inode = dentry->d_inode;
>
>                 if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
>                         return ERR_PTR(-ECHILD);
>                 if (unlikely(!inode))
>                         return ERR_PTR(-ENOENT);
>                 nd->path.dentry = dentry;
>                 nd->inode = inode;
>                 nd->seq = nd->next_seq;
>                 return NULL;
>         }
>         return step_into_slowpath(nd, flags, dentry);
> }
>
> for that matter?

this keeps the style of the current step_into, which I can stick with

The gotos, while uglier, imo provide easier handling should one try to
mess with the checks, but I'm not going to die on that hill.

btw, is there a way to combine DCACHE_MANAGED_DENTRY + symlink check
into one branch? The compiler refuses at the moment due to type being
a bitfield. Given how few free flags are remaining this is quite a
bummer.
Re: [PATCH] fs: inline step_into() and walk_component()
Posted by Al Viro 1 week, 5 days ago
On Wed, Nov 19, 2025 at 08:49:36PM +0100, Mateusz Guzik wrote:

> this keeps the style of the current step_into, which I can stick with
> 
> The gotos, while uglier, imo provide easier handling should one try to
> mess with the checks, but I'm not going to die on that hill.

The problem is that compiler is not the only audience to care about -
humans occasionally need to read and understand that as well, and
barrier-related logics in that area had caused quite a bit of PITA
over the years.

> btw, is there a way to combine DCACHE_MANAGED_DENTRY + symlink check
> into one branch? The compiler refuses at the moment due to type being
> a bitfield. Given how few free flags are remaining this is quite a
> bummer.

Is that really worth bothering with?  Condition is "bits 15..17 are 000
and bits 19..21 are _not_ 110" and I don't see any clean way to change
the encoding so that it could be done in one test.  In theory we could
do something like swapping the "type" and "managed" bits and use the
fact that 6 (symlink) is the maximal type we might run into, turning
that into flags & ((7<<15)|(7<<19))) < (6<<15), but...  *ewww*

If nothing else, it would
	* require us to remember that "managed" bits must be higher
than "type" ones
	* require us to remember that 7 is never to be used as type
	* require every sodding human reader to figure out what the
bleeding fuck is that test about
and I really don't believe that shaving a couple of cycles is worth
that.
Re: [PATCH] fs: inline step_into() and walk_component()
Posted by Mateusz Guzik 1 week, 5 days ago
So I think this is best split into 2 parts:
1. the tidy ups around step_into as present right now
2. implementation of the fast path + inlining of complete_walk

I did not previously mentally register that pick_link() is not
tail-callable because the address of struct path is passed as one of
the arguments. I'm going to massage it to pass dentry, mnt as 2 args
instead.

As for the fast path, I slightly tweaked your version:
static __always_inline const char *step_into(struct nameidata *nd, int flags,
                    struct dentry *dentry)
{
       if (likely((nd->flags & LOOKUP_RCU) &&
           !d_managed(dentry) && !d_is_symlink(dentry))) {
               /*
                * Simple case of not having to worry about mount
                * points or symlink traversal.
                */
               struct inode *inode = dentry->d_inode;
               if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
                       return ERR_PTR(-ECHILD);
               if (unlikely(!inode))
                       return ERR_PTR(-ENOENT);
               nd->path.dentry = dentry;
               /* nd->path.mnt is retained on purpose */
               nd->inode = inode;
               nd->seq = nd->next_seq;
               return NULL;
       }
       return step_into_slowpath(nd, flags, dentry);
}

There is a comment about path.mnt and use of d_managed instead of
open-coding it, the other comment is a little different, but again
that's cosmetics.

Since this is de facto your code, modulo small massaging by me (+ two
instances of __always_inline), I don't think I should be slapping my
authorship on it and maybe you prefer your version exactly as you sent
it. Alternatively, since this follows step_into() as currently
present, just lifted out and slightly massaged, I can slap my name on
this and state this is what happened.

I'll be a perfectly happy camper if you take authorship here and write
your own commit message or take mine in whatever capacity (presumably
you would want the bench + profiling info).

Just state how you want this sorted out on my end. My goal here is to
get this crapper out of the way, I don't care for some meta data on a
commit.

On Wed, Nov 19, 2025 at 9:24 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Wed, Nov 19, 2025 at 08:49:36PM +0100, Mateusz Guzik wrote:
> > btw, is there a way to combine DCACHE_MANAGED_DENTRY + symlink check
> > into one branch? The compiler refuses at the moment due to type being
> > a bitfield. Given how few free flags are remaining this is quite a
> > bummer.
>
> Is that really worth bothering with?  Condition is "bits 15..17 are 000
> and bits 19..21 are _not_ 110" and I don't see any clean way to change
> the encoding so that it could be done in one test.  In theory we could
> do something like swapping the "type" and "managed" bits and use the
> fact that 6 (symlink) is the maximal type we might run into, turning
> that into flags & ((7<<15)|(7<<19))) < (6<<15), but...  *ewww*
>
> If nothing else, it would
>         * require us to remember that "managed" bits must be higher
> than "type" ones
>         * require us to remember that 7 is never to be used as type
>         * require every sodding human reader to figure out what the
> bleeding fuck is that test about
> and I really don't believe that shaving a couple of cycles is worth
> that.
Re: [PATCH] fs: inline step_into() and walk_component()
Posted by Mateusz Guzik 1 week, 5 days ago
> On Wed, Nov 19, 2025 at 9:24 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > On Wed, Nov 19, 2025 at 08:49:36PM +0100, Mateusz Guzik wrote:
> > > btw, is there a way to combine DCACHE_MANAGED_DENTRY + symlink check
> > > into one branch? The compiler refuses at the moment due to type being
> > > a bitfield. Given how few free flags are remaining this is quite a
> > > bummer.
> >
> > Is that really worth bothering with?  Condition is "bits 15..17 are 000
> > and bits 19..21 are _not_ 110" and I don't see any clean way to change
> > the encoding so that it could be done in one test.  In theory we could
> > do something like swapping the "type" and "managed" bits and use the
> > fact that 6 (symlink) is the maximal type we might run into, turning
> > that into flags & ((7<<15)|(7<<19))) < (6<<15), but...  *ewww*
> >

Forgot to respond to this bit.

I was looking for something *trivial* and hoped I merely missed it.

I completely agree anything complicated/fancy is not warranted.
Re: [PATCH] fs: inline step_into() and walk_component()
Posted by Mateusz Guzik 1 week, 5 days ago
Attached benchmark code, ploppable into will-it-scale.

On Wed, Nov 19, 2025 at 7:40 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> The primary consumer is link_path_walk(), calling walk_component() every
> time which in turn calls step_into().
>
> Inlining these saves the overhead of 2 function calls per path component,
> along with allowing the compiler to do a better job optimizing them in place.
>
> step_into() had absolutely atrocious assembly to facilitate the
> slowpath. In order to lessen the burden at the callsite all the hard
> work is moved into step_into_slowpath(). This also elides some of the
> branches as for example LOOKUP_RCU is only checked once.
>
> The inline-able step_into() variant deals with the common case of
> traversing a cached, non-mounted-on directory while in RCU mode.
>
> Since symlink handling is already denoted as unlikely(), I took the
> opportunity to also shorten step_into_slowpath() by moving parts of it
> into pick_link() which further shortens assembly.
>
> Benchmarked as follows on Sapphire Rapids:
> 1. the "before" was a kernel with not-yet-merged optimizations (notably
>    elision of calls to security_inode_permission() and marking ext4
>    inodes as not having acls)
> 2. "after" is the same + this patch
> 3. benchmark consists of issuing 205 calls to access(2) in a loop with
>    pathnames lifted out of gcc and the linker building real code, most
>    of which have several path components and 118 of which fail with
>    -ENOENT. Some of those do symlink traversal.
>
> In terms of ops/s:
> before: 21619
> after:  22536 (+4%)
>
> profile before:
>   20.25%  [kernel]                  [k] __d_lookup_rcu
>   10.54%  [kernel]                  [k] link_path_walk
>   10.22%  [kernel]                  [k] entry_SYSCALL_64
>    6.50%  libc.so.6                 [.] __GI___access
>    6.35%  [kernel]                  [k] strncpy_from_user
>    4.87%  [kernel]                  [k] step_into
>    3.68%  [kernel]                  [k] kmem_cache_alloc_noprof
>    2.88%  [kernel]                  [k] walk_component
>    2.86%  [kernel]                  [k] kmem_cache_free
>    2.14%  [kernel]                  [k] set_root
>    2.08%  [kernel]                  [k] lookup_fast
>
> after:
>   23.38%  [kernel]                  [k] __d_lookup_rcu
>   11.27%  [kernel]                  [k] entry_SYSCALL_64
>   10.89%  [kernel]                  [k] link_path_walk
>    7.00%  libc.so.6                 [.] __GI___access
>    6.88%  [kernel]                  [k] strncpy_from_user
>    3.50%  [kernel]                  [k] kmem_cache_alloc_noprof
>    2.01%  [kernel]                  [k] kmem_cache_free
>    2.00%  [kernel]                  [k] set_root
>    1.99%  [kernel]                  [k] lookup_fast
>    1.81%  [kernel]                  [k] do_syscall_64
>    1.69%  [kernel]                  [k] entry_SYSCALL_64_safe_stack
>
> While walk_component() and step_into() of course disappear from the
> profile, link_path_walk() barely gains any overhead despite the
> inlining, thanks to the added fast path, all while completing more
> walks per second.
>
> I don't know why overhead grew a lot on __d_lookup_rcu().
>
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
>
> perhaps this could be 2-3 patches instead to do things incrementally
> the other patches not listed in the commit message are:
> 1. mntput_not_expire slowpath (Christian took it, not present in fs-next yet)
> 2. nd->depth predicts.
> 3. lookup_slow noinline
>
> all of those sent
>
> you may notice step_into is quite high on the profile:
>    4.87%  [kernel]                  [k] step_into
>
> here is the routine prior to the patch:
>     call   ffffffff81374630 <__fentry__>
>     push   %rbp
>     mov    %rsp,%rbp
>     push   %r15
>     push   %r14
>     mov    %esi,%r14d
>     push   %r13
>     push   %r12
>     push   %rbx
>     mov    %rdi,%rbx
>     sub    $0x28,%rsp
>     mov    (%rdi),%rax
>     mov    0x38(%rdi),%r8d
>     mov    %gs:0x2e10acd(%rip),%r12        # ffffffff84551008 <__stack_chk_guard>
>
>     mov    %r12,0x20(%rsp)
>     mov    %rdx,%r12
>     mov    %rdx,0x18(%rsp)
>     mov    %rax,0x10(%rsp)
>
> This is setup before it even gets to do anything, which of course has
> to be undone later. Also note the stackguard check. I'm not pasting
> everything; you can disasm it yourself to check the entire monstrosity.
>
> In contrast this is entirety of step_into() fast path with the patch + noinline:
>          call   ffffffff81374630 <__fentry__>
>          testb  $0x1,0x39(%rdi)
>          je     ffffffff81740cbe <step_into+0x4e>
>          mov    (%rdx),%eax
>          test   $0x38000,%eax
>          jne    ffffffff81740cbe <step_into+0x4e>
>          and    $0x380000,%eax
>          mov    0x30(%rdx),%rcx
>          cmp    $0x300000,%eax
>          je     ffffffff81740cbe <step_into+0x4e>
>          mov    (%rdi),%r8
>          mov    0x44(%rdi),%esi
>          mov    0x4(%rdx),%eax
>          cmp    %eax,%esi
>          jne    ffffffff81740cc3 <step_into+0x53>
>          test   %rcx,%rcx
>          je     ffffffff81740ccf <step_into+0x5f>
>          mov    0x44(%rdi),%eax
>          mov    %r8,(%rdi)
>          mov    %rdx,0x8(%rdi)
>          mov    %eax,0x40(%rdi)
>          xor    %eax,%eax
>          mov    %rcx,0x30(%rdi)
>          jmp    ffffffff8230b1f0 <__pi___x86_return_thunk>
>
> This possibly can be shortened but I have not tried yet.
>
>  fs/namei.c | 67 ++++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 50 insertions(+), 17 deletions(-)
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 1d1f864ad6ad..e00b9ce21536 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -1668,17 +1668,17 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
>         bool jumped;
>         int ret;
>
> -       path->mnt = nd->path.mnt;
> -       path->dentry = dentry;
>         if (nd->flags & LOOKUP_RCU) {
>                 unsigned int seq = nd->next_seq;
> +               if (likely(!(dentry->d_flags & DCACHE_MANAGED_DENTRY)))
> +                       return 0;
>                 if (likely(__follow_mount_rcu(nd, path)))
>                         return 0;
>                 // *path and nd->next_seq might've been clobbered
>                 path->mnt = nd->path.mnt;
>                 path->dentry = dentry;
>                 nd->next_seq = seq;
> -               if (!try_to_unlazy_next(nd, dentry))
> +               if (unlikely(!try_to_unlazy_next(nd, dentry)))
>                         return -ECHILD;
>         }
>         ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
> @@ -1941,12 +1941,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link)
>
>  enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
>
> -static const char *pick_link(struct nameidata *nd, struct path *link,
> +static noinline const char *pick_link(struct nameidata *nd, struct path *link,
>                      struct inode *inode, int flags)
>  {
>         struct saved *last;
>         const char *res;
> -       int error = reserve_stack(nd, link);
> +       int error;
> +
> +       if (nd->flags & LOOKUP_RCU) {
> +               /* make sure that d_is_symlink above matches inode */
> +               if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq))
> +                       return ERR_PTR(-ECHILD);
> +       } else {
> +               if (link->mnt == nd->path.mnt)
> +                       mntget(link->mnt);
> +       }
> +
> +       error = reserve_stack(nd, link);
>
>         if (unlikely(error)) {
>                 if (!(nd->flags & LOOKUP_RCU))
> @@ -2021,14 +2032,17 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
>   *
>   * NOTE: dentry must be what nd->next_seq had been sampled from.
>   */
> -static const char *step_into(struct nameidata *nd, int flags,
> +static noinline const char *step_into_slowpath(struct nameidata *nd, int flags,
>                      struct dentry *dentry)
>  {
>         struct path path;
>         struct inode *inode;
> -       int err = handle_mounts(nd, dentry, &path);
> +       int err;
>
> -       if (err < 0)
> +       path.mnt = nd->path.mnt;
> +       path.dentry = dentry;
> +       err = handle_mounts(nd, dentry, &path);
> +       if (unlikely(err < 0))
>                 return ERR_PTR(err);
>         inode = path.dentry->d_inode;
>         if (likely(!d_is_symlink(path.dentry)) ||
> @@ -2050,17 +2064,36 @@ static const char *step_into(struct nameidata *nd, int flags,
>                 nd->seq = nd->next_seq;
>                 return NULL;
>         }
> -       if (nd->flags & LOOKUP_RCU) {
> -               /* make sure that d_is_symlink above matches inode */
> -               if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
> -                       return ERR_PTR(-ECHILD);
> -       } else {
> -               if (path.mnt == nd->path.mnt)
> -                       mntget(path.mnt);
> -       }
>         return pick_link(nd, &path, inode, flags);
>  }
>
> +static __always_inline const char *step_into(struct nameidata *nd, int flags,
> +                    struct dentry *dentry)
> +{
> +       struct path path;
> +       struct inode *inode;
> +
> +       path.mnt = nd->path.mnt;
> +       path.dentry = dentry;
> +       if (!(nd->flags & LOOKUP_RCU))
> +               goto slowpath;
> +       if (unlikely(dentry->d_flags & DCACHE_MANAGED_DENTRY))
> +               goto slowpath;
> +       inode = path.dentry->d_inode;
> +       if (unlikely(d_is_symlink(path.dentry)))
> +               goto slowpath;
> +       if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
> +               return ERR_PTR(-ECHILD);
> +       if (unlikely(!inode))
> +               return ERR_PTR(-ENOENT);
> +       nd->path = path;
> +       nd->inode = inode;
> +       nd->seq = nd->next_seq;
> +       return NULL;
> +slowpath:
> +       return step_into_slowpath(nd, flags, dentry);
> +}
> +
>  static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
>  {
>         struct dentry *parent, *old;
> @@ -2171,7 +2204,7 @@ static const char *handle_dots(struct nameidata *nd, int type)
>         return NULL;
>  }
>
> -static const char *walk_component(struct nameidata *nd, int flags)
> +static __always_inline const char *walk_component(struct nameidata *nd, int flags)
>  {
>         struct dentry *dentry;
>         /*
> --
> 2.48.1
>