[PATCH v3 1/3] vfs: support caching symlink lengths in inodes

Mateusz Guzik posted 3 patches 1 day, 5 hours ago
[PATCH v3 1/3] vfs: support caching symlink lengths in inodes
Posted by Mateusz Guzik 1 day, 5 hours ago
When utilized it dodges strlen() in vfs_readlink(), giving about 1.5%
speed up when issuing readlink on /initrd.img on ext4.

Filesystems opt in by calling inode_set_cached_link() when creating an
inode.

The size is stored in a new union utilizing the same space as i_devices,
thus avoiding growing the struct or taking up any more space.

Churn-wise the current readlink_copy() helper is patched to accept the
size instead of calculating it.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
 fs/namei.c                     | 34 +++++++++++++++++++---------------
 fs/proc/namespaces.c           |  2 +-
 include/linux/fs.h             | 15 +++++++++++++--
 security/apparmor/apparmorfs.c |  2 +-
 4 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 9d30c7aa9aa6..e56c29a22d26 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -5272,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
 				getname(newname), 0);
 }
 
-int readlink_copy(char __user *buffer, int buflen, const char *link)
+int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen)
 {
-	int len = PTR_ERR(link);
-	if (IS_ERR(link))
-		goto out;
+	int copylen;
 
-	len = strlen(link);
-	if (len > (unsigned) buflen)
-		len = buflen;
-	if (copy_to_user(buffer, link, len))
-		len = -EFAULT;
-out:
-	return len;
+	copylen = linklen;
+	if (unlikely(copylen > (unsigned) buflen))
+		copylen = buflen;
+	if (copy_to_user(buffer, link, copylen))
+		copylen = -EFAULT;
+	return copylen;
 }
 
 /**
@@ -5304,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 	const char *link;
 	int res;
 
+	if (inode->i_opflags & IOP_CACHED_LINK)
+		return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen);
+
 	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
 		if (unlikely(inode->i_op->readlink))
 			return inode->i_op->readlink(dentry, buffer, buflen);
@@ -5322,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 		if (IS_ERR(link))
 			return PTR_ERR(link);
 	}
-	res = readlink_copy(buffer, buflen, link);
+	res = readlink_copy(buffer, buflen, link, strlen(link));
 	do_delayed_call(&done);
 	return res;
 }
@@ -5391,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link);
 
 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
+	const char *link;
+	int res;
+
 	DEFINE_DELAYED_CALL(done);
-	int res = readlink_copy(buffer, buflen,
-				page_get_link(dentry, d_inode(dentry),
-					      &done));
+	link = page_get_link(dentry, d_inode(dentry), &done);
+	res = PTR_ERR(link);
+	if (!IS_ERR(link))
+		res = readlink_copy(buffer, buflen, link, strlen(link));
 	do_delayed_call(&done);
 	return res;
 }
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 8e159fc78c0a..c610224faf10 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
 	if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
 		res = ns_get_name(name, sizeof(name), task, ns_ops);
 		if (res >= 0)
-			res = readlink_copy(buffer, buflen, name);
+			res = readlink_copy(buffer, buflen, name, strlen(name));
 	}
 	put_task_struct(task);
 	return res;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7e29433c5ecc..2cc98de5af43 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -626,6 +626,7 @@ is_uncached_acl(struct posix_acl *acl)
 #define IOP_XATTR	0x0008
 #define IOP_DEFAULT_READLINK	0x0010
 #define IOP_MGTIME	0x0020
+#define IOP_CACHED_LINK	0x0040
 
 /*
  * Keep mostly read-only and often accessed (especially for
@@ -723,7 +724,10 @@ struct inode {
 	};
 	struct file_lock_context	*i_flctx;
 	struct address_space	i_data;
-	struct list_head	i_devices;
+	union {
+		struct list_head	i_devices;
+		int			i_linklen;
+	};
 	union {
 		struct pipe_inode_info	*i_pipe;
 		struct cdev		*i_cdev;
@@ -749,6 +753,13 @@ struct inode {
 	void			*i_private; /* fs or device private pointer */
 } __randomize_layout;
 
+static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
+{
+	inode->i_link = link;
+	inode->i_linklen = linklen;
+	inode->i_opflags |= IOP_CACHED_LINK;
+}
+
 /*
  * Get bit address from inode->i_state to use with wait_var_event()
  * infrastructre.
@@ -3351,7 +3362,7 @@ extern const struct file_operations generic_ro_fops;
 
 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
 
-extern int readlink_copy(char __user *, int, const char *);
+extern int readlink_copy(char __user *, int, const char *, int);
 extern int page_readlink(struct dentry *, char __user *, int);
 extern const char *page_get_link(struct dentry *, struct inode *,
 				 struct delayed_call *);
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 01b923d97a44..60959cfba672 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -2611,7 +2611,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer,
 	res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME,
 		       d_inode(dentry)->i_ino);
 	if (res > 0 && res < sizeof(name))
-		res = readlink_copy(buffer, buflen, name);
+		res = readlink_copy(buffer, buflen, name, strlen(name));
 	else
 		res = -ENOENT;
 
-- 
2.43.0
Re: [PATCH v3 1/3] vfs: support caching symlink lengths in inodes
Posted by Christian Brauner 6 hours ago
On Wed, Nov 20, 2024 at 12:20:34PM +0100, Mateusz Guzik wrote:
> When utilized it dodges strlen() in vfs_readlink(), giving about 1.5%
> speed up when issuing readlink on /initrd.img on ext4.
> 
> Filesystems opt in by calling inode_set_cached_link() when creating an
> inode.
> 
> The size is stored in a new union utilizing the same space as i_devices,
> thus avoiding growing the struct or taking up any more space.
> 
> Churn-wise the current readlink_copy() helper is patched to accept the
> size instead of calculating it.
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
>  fs/namei.c                     | 34 +++++++++++++++++++---------------
>  fs/proc/namespaces.c           |  2 +-
>  include/linux/fs.h             | 15 +++++++++++++--
>  security/apparmor/apparmorfs.c |  2 +-
>  4 files changed, 34 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 9d30c7aa9aa6..e56c29a22d26 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -5272,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
>  				getname(newname), 0);
>  }
>  
> -int readlink_copy(char __user *buffer, int buflen, const char *link)
> +int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen)
>  {
> -	int len = PTR_ERR(link);
> -	if (IS_ERR(link))
> -		goto out;
> +	int copylen;
>  
> -	len = strlen(link);
> -	if (len > (unsigned) buflen)
> -		len = buflen;
> -	if (copy_to_user(buffer, link, len))
> -		len = -EFAULT;
> -out:
> -	return len;
> +	copylen = linklen;
> +	if (unlikely(copylen > (unsigned) buflen))
> +		copylen = buflen;
> +	if (copy_to_user(buffer, link, copylen))
> +		copylen = -EFAULT;
> +	return copylen;
>  }
>  
>  /**
> @@ -5304,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
>  	const char *link;
>  	int res;
>  
> +	if (inode->i_opflags & IOP_CACHED_LINK)
> +		return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen);
> +
>  	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
>  		if (unlikely(inode->i_op->readlink))
>  			return inode->i_op->readlink(dentry, buffer, buflen);
> @@ -5322,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
>  		if (IS_ERR(link))
>  			return PTR_ERR(link);
>  	}
> -	res = readlink_copy(buffer, buflen, link);
> +	res = readlink_copy(buffer, buflen, link, strlen(link));
>  	do_delayed_call(&done);
>  	return res;
>  }
> @@ -5391,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link);
>  
>  int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
>  {
> +	const char *link;
> +	int res;
> +
>  	DEFINE_DELAYED_CALL(done);
> -	int res = readlink_copy(buffer, buflen,
> -				page_get_link(dentry, d_inode(dentry),
> -					      &done));
> +	link = page_get_link(dentry, d_inode(dentry), &done);
> +	res = PTR_ERR(link);
> +	if (!IS_ERR(link))
> +		res = readlink_copy(buffer, buflen, link, strlen(link));
>  	do_delayed_call(&done);
>  	return res;
>  }
> diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
> index 8e159fc78c0a..c610224faf10 100644
> --- a/fs/proc/namespaces.c
> +++ b/fs/proc/namespaces.c
> @@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
>  	if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
>  		res = ns_get_name(name, sizeof(name), task, ns_ops);
>  		if (res >= 0)
> -			res = readlink_copy(buffer, buflen, name);
> +			res = readlink_copy(buffer, buflen, name, strlen(name));
>  	}
>  	put_task_struct(task);
>  	return res;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 7e29433c5ecc..2cc98de5af43 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -626,6 +626,7 @@ is_uncached_acl(struct posix_acl *acl)
>  #define IOP_XATTR	0x0008
>  #define IOP_DEFAULT_READLINK	0x0010
>  #define IOP_MGTIME	0x0020
> +#define IOP_CACHED_LINK	0x0040
>  
>  /*
>   * Keep mostly read-only and often accessed (especially for
> @@ -723,7 +724,10 @@ struct inode {
>  	};
>  	struct file_lock_context	*i_flctx;
>  	struct address_space	i_data;
> -	struct list_head	i_devices;
> +	union {
> +		struct list_head	i_devices;
> +		int			i_linklen;
> +	};

I think that i_devices should be moved into the union as it's really
only used with i_cdev but it's not that easily done because list_head
needs to be initialized. I roughly envisioned something like:

union {
        struct {
                struct cdev             *i_cdev;
                struct list_head        i_devices;
        };
        struct {
                char                    *i_link;
                unsigned int            i_link_len;
        };
        struct pipe_inode_info          *i_pipe;
        unsigned                        i_dir_seq;
};

But it's not important enough imho.
Re: [PATCH v3 1/3] vfs: support caching symlink lengths in inodes
Posted by Mateusz Guzik 2 hours ago
On Thu, Nov 21, 2024 at 11:12 AM Christian Brauner <brauner@kernel.org> wrote:
> I think that i_devices should be moved into the union as it's really
> only used with i_cdev but it's not that easily done because list_head
> needs to be initialized. I roughly envisioned something like:
>
> union {
>         struct {
>                 struct cdev             *i_cdev;
>                 struct list_head        i_devices;
>         };
>         struct {
>                 char                    *i_link;
>                 unsigned int            i_link_len;
>         };
>         struct pipe_inode_info          *i_pipe;
>         unsigned                        i_dir_seq;
> };
>
> But it's not important enough imho.

I thought about doing that, but decided not to. I mentioned some time
ago that the layout of struct inode is false-sharing friendly and the
kernel is not in shape where this can be sensibly fixed yet and it may
or may not affect what to do with the above.

On the stock kernel the issues are:
- a spurious lockref acquire/release -- I posted a patch for it, Al
did not like it and wrote his own, does not look like it landed though
- apparmor -- everything serializes on label ref management (this *is*
used by ubuntu for example, but also the lkp machinery). Other people
posted patchsets to get rid of the problem, but they ran into their
own snags. I have a wip with of my own.

Anyhow, with these eliminated it will be possible to evaluate what
happens with inode rearrengements. Until then I think any messing with
the layout is best avoided.

thanks for taking the patchset
-- 
Mateusz Guzik <mjguzik gmail.com>