[PATCH v3 1/3] fs: speed up path lookup with cheaper handling of MAY_EXEC

Mateusz Guzik posted 3 patches 1 month, 1 week ago
[PATCH v3 1/3] fs: speed up path lookup with cheaper handling of MAY_EXEC
Posted by Mateusz Guzik 1 month, 1 week ago
The generic inode_permission() routine does work which is known to be of
no significance for lookup. There are checks for MAY_WRITE, while the
requested permission is MAY_EXEC. Additionally devcgroup_inode_permission()
is called to check for devices, but it is an invariant the inode is a
directory.

Absent a ->permission func, execution lands in generic_permission()
which checks upfront if the requested permission is granted for
everyone.

We can elide the branches which are guaranteed to be false and cut
straight to the check if everyone happens to be allowed MAY_EXEC on the
inode (which holds true most of the time).

Moreover, filesystems which provide their own ->permission routine can
take advantage of the optimization by setting the IOP_FASTPERM_MAY_EXEC
flag on their inodes, which they can legitimately do if their MAY_EXEC
handling matches generic_permission().

As a simple benchmark, as part of compilation gcc issues access(2) on
numerous long paths, for example /usr/lib/gcc/x86_64-linux-gnu/12/crtendS.o

Issuing access(2) on it in a loop on ext4 on Sapphire Rapids (ops/s):
before: 3797556
after:  3987789 (+5%)

Note: this depends on the not-yet-landed ext4 patch to mark inodes with
cache_no_acl()

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
 fs/namei.c         | 43 +++++++++++++++++++++++++++++++++++++++++--
 include/linux/fs.h | 13 +++++++------
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index a9f9d0453425..6b2a5a5478e7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -540,6 +540,9 @@ static inline int do_inode_permission(struct mnt_idmap *idmap,
  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  *
  * Separate out file-system wide checks from inode-specific permission checks.
+ *
+ * Note: lookup_inode_permission_may_exec() does not call here. If you add
+ * MAY_EXEC checks, adjust it.
  */
 static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
 {
@@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap,
 }
 EXPORT_SYMBOL(inode_permission);
 
+/**
+ * lookup_inode_permission_may_exec - Check traversal right for given inode
+ *
+ * This is a special case routine for may_lookup() making assumptions specific
+ * to path traversal. Use inode_permission() if you are doing something else.
+ *
+ * Work is shaved off compared to inode_permission() as follows:
+ * - we know for a fact there is no MAY_WRITE to worry about
+ * - it is an invariant the inode is a directory
+ *
+ * Since majority of real-world traversal happens on inodes which grant it for
+ * everyone, we check it upfront and only resort to more expensive work if it
+ * fails.
+ *
+ * Filesystems which have their own ->permission hook and consequently miss out
+ * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
+ * on their directory inodes.
+ */
+static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
+	struct inode *inode, int mask)
+{
+	/* Lookup already checked this to return -ENOTDIR */
+	VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
+	VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0);
+
+	mask |= MAY_EXEC;
+
+	if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC))))
+		return inode_permission(idmap, inode, mask);
+
+	if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
+		return inode_permission(idmap, inode, mask);
+
+	return security_inode_permission(inode, mask);
+}
+
 /**
  * path_get - get a reference to a path
  * @path: path to get the reference to
@@ -1855,7 +1894,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
 	int err, mask;
 
 	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
-	err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
+	err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
 	if (likely(!err))
 		return 0;
 
@@ -1870,7 +1909,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
 	if (err != -ECHILD)	// hard error
 		return err;
 
-	return inode_permission(idmap, nd->inode, MAY_EXEC);
+	return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
 }
 
 static int reserve_stack(struct nameidata *nd, struct path *link)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 03e450dd5211..7d5de647ac7b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -647,13 +647,14 @@ is_uncached_acl(struct posix_acl *acl)
 	return (long)acl & 1;
 }
 
-#define IOP_FASTPERM	0x0001
-#define IOP_LOOKUP	0x0002
-#define IOP_NOFOLLOW	0x0004
-#define IOP_XATTR	0x0008
+#define IOP_FASTPERM		0x0001
+#define IOP_LOOKUP		0x0002
+#define IOP_NOFOLLOW		0x0004
+#define IOP_XATTR		0x0008
 #define IOP_DEFAULT_READLINK	0x0010
-#define IOP_MGTIME	0x0020
-#define IOP_CACHED_LINK	0x0040
+#define IOP_MGTIME		0x0020
+#define IOP_CACHED_LINK		0x0040
+#define IOP_FASTPERM_MAY_EXEC	0x0080
 
 /*
  * Inode state bits.  Protected by inode->i_lock
-- 
2.48.1
Re: [PATCH v3 1/3] fs: speed up path lookup with cheaper handling of MAY_EXEC
Posted by Christian Brauner 1 month, 1 week ago
On Fri, Nov 07, 2025 at 03:21:47PM +0100, Mateusz Guzik wrote:
> The generic inode_permission() routine does work which is known to be of
> no significance for lookup. There are checks for MAY_WRITE, while the
> requested permission is MAY_EXEC. Additionally devcgroup_inode_permission()
> is called to check for devices, but it is an invariant the inode is a
> directory.
> 
> Absent a ->permission func, execution lands in generic_permission()
> which checks upfront if the requested permission is granted for
> everyone.
> 
> We can elide the branches which are guaranteed to be false and cut
> straight to the check if everyone happens to be allowed MAY_EXEC on the
> inode (which holds true most of the time).
> 
> Moreover, filesystems which provide their own ->permission routine can
> take advantage of the optimization by setting the IOP_FASTPERM_MAY_EXEC
> flag on their inodes, which they can legitimately do if their MAY_EXEC
> handling matches generic_permission().
> 
> As a simple benchmark, as part of compilation gcc issues access(2) on
> numerous long paths, for example /usr/lib/gcc/x86_64-linux-gnu/12/crtendS.o
> 
> Issuing access(2) on it in a loop on ext4 on Sapphire Rapids (ops/s):
> before: 3797556
> after:  3987789 (+5%)
> 
> Note: this depends on the not-yet-landed ext4 patch to mark inodes with
> cache_no_acl()
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
>  fs/namei.c         | 43 +++++++++++++++++++++++++++++++++++++++++--
>  include/linux/fs.h | 13 +++++++------
>  2 files changed, 48 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index a9f9d0453425..6b2a5a5478e7 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -540,6 +540,9 @@ static inline int do_inode_permission(struct mnt_idmap *idmap,
>   * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
>   *
>   * Separate out file-system wide checks from inode-specific permission checks.
> + *
> + * Note: lookup_inode_permission_may_exec() does not call here. If you add
> + * MAY_EXEC checks, adjust it.
>   */
>  static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
>  {
> @@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap,
>  }
>  EXPORT_SYMBOL(inode_permission);
>  
> +/**
> + * lookup_inode_permission_may_exec - Check traversal right for given inode
> + *
> + * This is a special case routine for may_lookup() making assumptions specific
> + * to path traversal. Use inode_permission() if you are doing something else.
> + *
> + * Work is shaved off compared to inode_permission() as follows:
> + * - we know for a fact there is no MAY_WRITE to worry about
> + * - it is an invariant the inode is a directory
> + *
> + * Since majority of real-world traversal happens on inodes which grant it for
> + * everyone, we check it upfront and only resort to more expensive work if it
> + * fails.
> + *
> + * Filesystems which have their own ->permission hook and consequently miss out
> + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
> + * on their directory inodes.
> + */
> +static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
> +	struct inode *inode, int mask)
> +{
> +	/* Lookup already checked this to return -ENOTDIR */
> +	VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
> +	VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0);
> +
> +	mask |= MAY_EXEC;
> +
> +	if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC))))
> +		return inode_permission(idmap, inode, mask);
> +
> +	if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))

Can you send a follow-up where 0111 is a constant with some descriptive
name, please? Can be local to the file. I hate these raw-coded
permission masks with a passion.

> +		return inode_permission(idmap, inode, mask);
> +
> +	return security_inode_permission(inode, mask);
> +}
> +
>  /**
>   * path_get - get a reference to a path
>   * @path: path to get the reference to
> @@ -1855,7 +1894,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
>  	int err, mask;
>  
>  	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
> -	err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
> +	err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
>  	if (likely(!err))
>  		return 0;
>  
> @@ -1870,7 +1909,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
>  	if (err != -ECHILD)	// hard error
>  		return err;
>  
> -	return inode_permission(idmap, nd->inode, MAY_EXEC);
> +	return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
>  }
>  
>  static int reserve_stack(struct nameidata *nd, struct path *link)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 03e450dd5211..7d5de647ac7b 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -647,13 +647,14 @@ is_uncached_acl(struct posix_acl *acl)
>  	return (long)acl & 1;
>  }
>  
> -#define IOP_FASTPERM	0x0001
> -#define IOP_LOOKUP	0x0002
> -#define IOP_NOFOLLOW	0x0004
> -#define IOP_XATTR	0x0008
> +#define IOP_FASTPERM		0x0001
> +#define IOP_LOOKUP		0x0002
> +#define IOP_NOFOLLOW		0x0004
> +#define IOP_XATTR		0x0008
>  #define IOP_DEFAULT_READLINK	0x0010
> -#define IOP_MGTIME	0x0020
> -#define IOP_CACHED_LINK	0x0040
> +#define IOP_MGTIME		0x0020
> +#define IOP_CACHED_LINK		0x0040
> +#define IOP_FASTPERM_MAY_EXEC	0x0080
>  
>  /*
>   * Inode state bits.  Protected by inode->i_lock
> -- 
> 2.48.1
>
Re: [PATCH v3 1/3] fs: speed up path lookup with cheaper handling of MAY_EXEC
Posted by Mateusz Guzik 1 month, 1 week ago
On Tue, Nov 11, 2025 at 10:41 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Fri, Nov 07, 2025 at 03:21:47PM +0100, Mateusz Guzik wrote:
> > +     if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
>
> Can you send a follow-up where 0111 is a constant with some descriptive
> name, please? Can be local to the file. I hate these raw-coded
> permission masks with a passion.
>

#define UNIX_PERM_ALL_X 0111?

I have no opinion about hardcoding this vs using a macro, but don't
have a good name for that one either.
Re: [PATCH v3 1/3] fs: speed up path lookup with cheaper handling of MAY_EXEC
Posted by Mateusz Guzik 1 month, 1 week ago
On Tue, Nov 11, 2025 at 11:51 AM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> On Tue, Nov 11, 2025 at 10:41 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Fri, Nov 07, 2025 at 03:21:47PM +0100, Mateusz Guzik wrote:
> > > +     if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
> >
> > Can you send a follow-up where 0111 is a constant with some descriptive
> > name, please? Can be local to the file. I hate these raw-coded
> > permission masks with a passion.
> >
>
> #define UNIX_PERM_ALL_X 0111?
>
> I have no opinion about hardcoding this vs using a macro, but don't
> have a good name for that one either.

Apart from usage added by me here there is:

fs/coredump.c:          if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
fs/namei.c:      *  - multiplying by 0111 spreads them out to all of ugo
fs/namei.c:     if (!((mask & 7) * 0111 & ~mode)) {

That's ignoring other spots which definitely want 0111 spelled out in
per-fs code.

I would argue the other 2 in namei.c want this spelled out numerically as well:

          │*  - 'mask&7' is the requested permission bit set
          │*  - multiplying by 0111 spreads them out to all of ugo
          │*  - '& ~mode' looks for missing inode permission bits
          │*  - the '!' is for "no missing permissions"
[snip]
          if (!((mask & 7) * 0111 & ~mode)) {

But then it may make sense to keep this numerical in the new code as
well so that anyone looking at lookup_inode_permission_may_exec() and
inode_permission()->generic_permission()->acl_permission_check() can
see it's the same thing.

I figured maybe a comment would do the trick above the 0111 usage, but
the commentary added at the top of the func imo covers it:
   * Since majority of real-world traversal happens on inodes which grant it for
   * everyone, we check it upfront and only resort to more expensive work if it
   * fails.

All that said, now that I look at it, I think the code is best left
with 0111 spelled out in place, so I won't be submitting a patch to
change that.

Given that hiding it behind some name or adding a comment is a trivial
edit, I don't think it's much of a burden for you to do it should you
choose to make such a change anyway.
Re: [PATCH v3 1/3] fs: speed up path lookup with cheaper handling of MAY_EXEC
Posted by Jan Kara 1 month, 1 week ago
On Fri 07-11-25 15:21:47, Mateusz Guzik wrote:
> The generic inode_permission() routine does work which is known to be of
> no significance for lookup. There are checks for MAY_WRITE, while the
> requested permission is MAY_EXEC. Additionally devcgroup_inode_permission()
> is called to check for devices, but it is an invariant the inode is a
> directory.
> 
> Absent a ->permission func, execution lands in generic_permission()
> which checks upfront if the requested permission is granted for
> everyone.
> 
> We can elide the branches which are guaranteed to be false and cut
> straight to the check if everyone happens to be allowed MAY_EXEC on the
> inode (which holds true most of the time).
> 
> Moreover, filesystems which provide their own ->permission routine can
> take advantage of the optimization by setting the IOP_FASTPERM_MAY_EXEC
> flag on their inodes, which they can legitimately do if their MAY_EXEC
> handling matches generic_permission().
> 
> As a simple benchmark, as part of compilation gcc issues access(2) on
> numerous long paths, for example /usr/lib/gcc/x86_64-linux-gnu/12/crtendS.o
> 
> Issuing access(2) on it in a loop on ext4 on Sapphire Rapids (ops/s):
> before: 3797556
> after:  3987789 (+5%)
> 
> Note: this depends on the not-yet-landed ext4 patch to mark inodes with
> cache_no_acl()
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>

The gain is nice. I'm just wondering where exactly is it coming from? I
don't see that we'd be saving some memory load or significant amount of
work. So is it really coming from the more compact code and saved several
unlikely branches and function calls?

								Honza

> ---
>  fs/namei.c         | 43 +++++++++++++++++++++++++++++++++++++++++--
>  include/linux/fs.h | 13 +++++++------
>  2 files changed, 48 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index a9f9d0453425..6b2a5a5478e7 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -540,6 +540,9 @@ static inline int do_inode_permission(struct mnt_idmap *idmap,
>   * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
>   *
>   * Separate out file-system wide checks from inode-specific permission checks.
> + *
> + * Note: lookup_inode_permission_may_exec() does not call here. If you add
> + * MAY_EXEC checks, adjust it.
>   */
>  static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
>  {
> @@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap,
>  }
>  EXPORT_SYMBOL(inode_permission);
>  
> +/**
> + * lookup_inode_permission_may_exec - Check traversal right for given inode
> + *
> + * This is a special case routine for may_lookup() making assumptions specific
> + * to path traversal. Use inode_permission() if you are doing something else.
> + *
> + * Work is shaved off compared to inode_permission() as follows:
> + * - we know for a fact there is no MAY_WRITE to worry about
> + * - it is an invariant the inode is a directory
> + *
> + * Since majority of real-world traversal happens on inodes which grant it for
> + * everyone, we check it upfront and only resort to more expensive work if it
> + * fails.
> + *
> + * Filesystems which have their own ->permission hook and consequently miss out
> + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
> + * on their directory inodes.
> + */
> +static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
> +	struct inode *inode, int mask)
> +{
> +	/* Lookup already checked this to return -ENOTDIR */
> +	VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
> +	VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0);
> +
> +	mask |= MAY_EXEC;
> +
> +	if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC))))
> +		return inode_permission(idmap, inode, mask);
> +
> +	if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
> +		return inode_permission(idmap, inode, mask);
> +
> +	return security_inode_permission(inode, mask);
> +}
> +
>  /**
>   * path_get - get a reference to a path
>   * @path: path to get the reference to
> @@ -1855,7 +1894,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
>  	int err, mask;
>  
>  	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
> -	err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
> +	err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
>  	if (likely(!err))
>  		return 0;
>  
> @@ -1870,7 +1909,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
>  	if (err != -ECHILD)	// hard error
>  		return err;
>  
> -	return inode_permission(idmap, nd->inode, MAY_EXEC);
> +	return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
>  }
>  
>  static int reserve_stack(struct nameidata *nd, struct path *link)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 03e450dd5211..7d5de647ac7b 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -647,13 +647,14 @@ is_uncached_acl(struct posix_acl *acl)
>  	return (long)acl & 1;
>  }
>  
> -#define IOP_FASTPERM	0x0001
> -#define IOP_LOOKUP	0x0002
> -#define IOP_NOFOLLOW	0x0004
> -#define IOP_XATTR	0x0008
> +#define IOP_FASTPERM		0x0001
> +#define IOP_LOOKUP		0x0002
> +#define IOP_NOFOLLOW		0x0004
> +#define IOP_XATTR		0x0008
>  #define IOP_DEFAULT_READLINK	0x0010
> -#define IOP_MGTIME	0x0020
> -#define IOP_CACHED_LINK	0x0040
> +#define IOP_MGTIME		0x0020
> +#define IOP_CACHED_LINK		0x0040
> +#define IOP_FASTPERM_MAY_EXEC	0x0080
>  
>  /*
>   * Inode state bits.  Protected by inode->i_lock
> -- 
> 2.48.1
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
Re: [PATCH v3 1/3] fs: speed up path lookup with cheaper handling of MAY_EXEC
Posted by Mateusz Guzik 1 month, 1 week ago
On Mon, Nov 10, 2025 at 10:32 AM Jan Kara <jack@suse.cz> wrote:
>
> On Fri 07-11-25 15:21:47, Mateusz Guzik wrote:
> > The generic inode_permission() routine does work which is known to be of
> > no significance for lookup. There are checks for MAY_WRITE, while the
> > requested permission is MAY_EXEC. Additionally devcgroup_inode_permission()
> > is called to check for devices, but it is an invariant the inode is a
> > directory.
> >
> > Absent a ->permission func, execution lands in generic_permission()
> > which checks upfront if the requested permission is granted for
> > everyone.
> >
> > We can elide the branches which are guaranteed to be false and cut
> > straight to the check if everyone happens to be allowed MAY_EXEC on the
> > inode (which holds true most of the time).
> >
> > Moreover, filesystems which provide their own ->permission routine can
> > take advantage of the optimization by setting the IOP_FASTPERM_MAY_EXEC
> > flag on their inodes, which they can legitimately do if their MAY_EXEC
> > handling matches generic_permission().
> >
> > As a simple benchmark, as part of compilation gcc issues access(2) on
> > numerous long paths, for example /usr/lib/gcc/x86_64-linux-gnu/12/crtendS.o
> >
> > Issuing access(2) on it in a loop on ext4 on Sapphire Rapids (ops/s):
> > before: 3797556
> > after:  3987789 (+5%)
> >
> > Note: this depends on the not-yet-landed ext4 patch to mark inodes with
> > cache_no_acl()
> >
> > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
>
> The gain is nice. I'm just wondering where exactly is it coming from? I
> don't see that we'd be saving some memory load or significant amount of
> work. So is it really coming from the more compact code and saved several
> unlikely branches and function calls?
>

That's several branches and 2 function calls per path component on the
way to the terminal inode. In the path at hand, that's 10 function
calls elided.

>                                                                 Honza
>
> > ---
> >  fs/namei.c         | 43 +++++++++++++++++++++++++++++++++++++++++--
> >  include/linux/fs.h | 13 +++++++------
> >  2 files changed, 48 insertions(+), 8 deletions(-)
> >
> > diff --git a/fs/namei.c b/fs/namei.c
> > index a9f9d0453425..6b2a5a5478e7 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -540,6 +540,9 @@ static inline int do_inode_permission(struct mnt_idmap *idmap,
> >   * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
> >   *
> >   * Separate out file-system wide checks from inode-specific permission checks.
> > + *
> > + * Note: lookup_inode_permission_may_exec() does not call here. If you add
> > + * MAY_EXEC checks, adjust it.
> >   */
> >  static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
> >  {
> > @@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap,
> >  }
> >  EXPORT_SYMBOL(inode_permission);
> >
> > +/**
> > + * lookup_inode_permission_may_exec - Check traversal right for given inode
> > + *
> > + * This is a special case routine for may_lookup() making assumptions specific
> > + * to path traversal. Use inode_permission() if you are doing something else.
> > + *
> > + * Work is shaved off compared to inode_permission() as follows:
> > + * - we know for a fact there is no MAY_WRITE to worry about
> > + * - it is an invariant the inode is a directory
> > + *
> > + * Since majority of real-world traversal happens on inodes which grant it for
> > + * everyone, we check it upfront and only resort to more expensive work if it
> > + * fails.
> > + *
> > + * Filesystems which have their own ->permission hook and consequently miss out
> > + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
> > + * on their directory inodes.
> > + */
> > +static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
> > +     struct inode *inode, int mask)
> > +{
> > +     /* Lookup already checked this to return -ENOTDIR */
> > +     VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
> > +     VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0);
> > +
> > +     mask |= MAY_EXEC;
> > +
> > +     if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC))))
> > +             return inode_permission(idmap, inode, mask);
> > +
> > +     if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
> > +             return inode_permission(idmap, inode, mask);
> > +
> > +     return security_inode_permission(inode, mask);
> > +}
> > +
> >  /**
> >   * path_get - get a reference to a path
> >   * @path: path to get the reference to
> > @@ -1855,7 +1894,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
> >       int err, mask;
> >
> >       mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
> > -     err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
> > +     err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
> >       if (likely(!err))
> >               return 0;
> >
> > @@ -1870,7 +1909,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
> >       if (err != -ECHILD)     // hard error
> >               return err;
> >
> > -     return inode_permission(idmap, nd->inode, MAY_EXEC);
> > +     return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
> >  }
> >
> >  static int reserve_stack(struct nameidata *nd, struct path *link)
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index 03e450dd5211..7d5de647ac7b 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -647,13 +647,14 @@ is_uncached_acl(struct posix_acl *acl)
> >       return (long)acl & 1;
> >  }
> >
> > -#define IOP_FASTPERM 0x0001
> > -#define IOP_LOOKUP   0x0002
> > -#define IOP_NOFOLLOW 0x0004
> > -#define IOP_XATTR    0x0008
> > +#define IOP_FASTPERM         0x0001
> > +#define IOP_LOOKUP           0x0002
> > +#define IOP_NOFOLLOW         0x0004
> > +#define IOP_XATTR            0x0008
> >  #define IOP_DEFAULT_READLINK 0x0010
> > -#define IOP_MGTIME   0x0020
> > -#define IOP_CACHED_LINK      0x0040
> > +#define IOP_MGTIME           0x0020
> > +#define IOP_CACHED_LINK              0x0040
> > +#define IOP_FASTPERM_MAY_EXEC        0x0080
> >
> >  /*
> >   * Inode state bits.  Protected by inode->i_lock
> > --
> > 2.48.1
> >
> --
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Re: [PATCH v3 1/3] fs: speed up path lookup with cheaper handling of MAY_EXEC
Posted by Jan Kara 1 month, 1 week ago
On Mon 10-11-25 10:46:38, Mateusz Guzik wrote:
> On Mon, Nov 10, 2025 at 10:32 AM Jan Kara <jack@suse.cz> wrote:
> >
> > On Fri 07-11-25 15:21:47, Mateusz Guzik wrote:
> > > The generic inode_permission() routine does work which is known to be of
> > > no significance for lookup. There are checks for MAY_WRITE, while the
> > > requested permission is MAY_EXEC. Additionally devcgroup_inode_permission()
> > > is called to check for devices, but it is an invariant the inode is a
> > > directory.
> > >
> > > Absent a ->permission func, execution lands in generic_permission()
> > > which checks upfront if the requested permission is granted for
> > > everyone.
> > >
> > > We can elide the branches which are guaranteed to be false and cut
> > > straight to the check if everyone happens to be allowed MAY_EXEC on the
> > > inode (which holds true most of the time).
> > >
> > > Moreover, filesystems which provide their own ->permission routine can
> > > take advantage of the optimization by setting the IOP_FASTPERM_MAY_EXEC
> > > flag on their inodes, which they can legitimately do if their MAY_EXEC
> > > handling matches generic_permission().
> > >
> > > As a simple benchmark, as part of compilation gcc issues access(2) on
> > > numerous long paths, for example /usr/lib/gcc/x86_64-linux-gnu/12/crtendS.o
> > >
> > > Issuing access(2) on it in a loop on ext4 on Sapphire Rapids (ops/s):
> > > before: 3797556
> > > after:  3987789 (+5%)
> > >
> > > Note: this depends on the not-yet-landed ext4 patch to mark inodes with
> > > cache_no_acl()
> > >
> > > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> >
> > The gain is nice. I'm just wondering where exactly is it coming from? I
> > don't see that we'd be saving some memory load or significant amount of
> > work. So is it really coming from the more compact code and saved several
> > unlikely branches and function calls?
> 
> That's several branches and 2 function calls per path component on the
> way to the terminal inode. In the path at hand, that's 10 function
> calls elided.

OK, the path lookup is really light so I guess 10 function calls are visible
enough. I guess this is a hot enough path that the microoptimization is worth
the code duplication. So feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
Re: [PATCH v3 1/3] fs: speed up path lookup with cheaper handling of MAY_EXEC
Posted by Mateusz Guzik 1 month, 1 week ago
On Mon, Nov 10, 2025 at 11:13 AM Jan Kara <jack@suse.cz> wrote:
> OK, the path lookup is really light

I would not go that far ;)

The current code has function calls which can be either inlined or elided.

More importantly it is a massive branch-fest, notably with repeated
LOOKUP_RCU checks.

Based on my work on the same stuff $elsewhere, most of the time the
entry in the cache is there and is a directory you can traverse
through and which is not mounted on.

While there is a bunch of likely/unlikely usage to help out, the code
is not structured in a way which allows for easy use of it. Instead
some of the branches are repeated or have to be present to begin with.

Ideally lookup could roll forward over a pathname without function
calls as long as fast path conditions hold. You would still need to
pay to check permissions and that this is a non-mounted directory for
every path component, but some of this can be combined. Per the above,
the repeated LOOKUP_RCU checks would be whacked. Checking if this is a
directory which got mounted on *OR* is it a symlink could be one
branch and so on.

On path parsing side, userspace could have passed something fucky like
foo/////bar and this of course needs to be handled but it does not
require the current ugliness to do so. This does happen with real
programs (typically two slashes in a row), but it also constitutes a
small minority of paths. The current code makes sure to skip the
spurious slashes before looking up the name.

My code $elsewhere instead notes it is an invariant that a name
containing a slash cannot appear in the cache so it just goes forward
with the lookup. If an entry is found, the name could not have started
with / and the check is elided (common case). Should the entry be
missing then indeed we check if slashes need to get rolled over.

And so on.

I think I can incrementally reduce a bunch of overhead, but it will
always be leaving some perf on the table unless restructured.

As for some profiling of the state, I booted up a kernel with all of
my patches (including an extra to elide security_inode_permission) +
sheaves and perf top'ed over a testcase which consists of series of
access(2) calls lifted from strace on gcc and the linker. To the tune
of 205 paths, some of them repeated and several deranged -- for
example:
        access("/usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/lib/x86_64-linux-gnu/12/Scrt1.o", R_OK);
        access("/usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/lib/x86_64-linux-gnu/Scrt1.o", R_OK);
        access("/usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/lib/../lib/Scrt1.o", R_OK);

The file is attached for those interested.

The profile:
  20.43%  [kernel]                  [k] __d_lookup_rcu
  10.66%  [kernel]                  [k] entry_SYSCALL_64
   9.50%  [kernel]                  [k] link_path_walk
   6.98%  libc.so.6                 [.] __GI___access
   6.04%  [kernel]                  [k] strncpy_from_user
   4.81%  [kernel]                  [k] step_into
   3.36%  [kernel]                  [k] kmem_cache_alloc_noprof
   2.80%  [kernel]                  [k] kmem_cache_free
   2.77%  [kernel]                  [k] walk_component
   2.18%  [kernel]                  [k] lookup_fast
   1.83%  [kernel]                  [k] set_root
   1.83%  [kernel]                  [k] do_syscall_64
   1.65%  [kernel]                  [k] getname_flags.part.0
   1.57%  [kernel]                  [k] entry_SYSCALL_64_safe_stack
   1.52%  [kernel]                  [k] nd_jump_root
   1.48%  [kernel]                  [k] filename_lookup
   1.34%  [kernel]                  [k] path_init
   1.33%  [kernel]                  [k] do_faccessat
   1.23%  [kernel]                  [k] __legitimize_mnt
   1.23%  [kernel]                  [k] lockref_get_not_dead
   0.96%  [kernel]                  [k] path_lookupat
   0.92%  [kernel]                  [k] lockref_put_return
   0.86%  [kernel]                  [k] its_return_thunk
   0.83%  [kernel]                  [k] entry_SYSCALL_64_after_hwframe
   0.80%  [kernel]                  [k] map_id_range_down
   0.68%  [kernel]                  [k] user_path_at