[PATCH v3 bpf-next 1/4] kernfs: remove iattr_mutex

Song Liu posted 4 patches 3 months, 2 weeks ago
[PATCH v3 bpf-next 1/4] kernfs: remove iattr_mutex
Posted by Song Liu 3 months, 2 weeks ago
From: Christian Brauner <brauner@kernel.org>

All allocations of struct kernfs_iattrs are serialized through a global
mutex. Simply do a racy allocation and let the first one win. I bet most
callers are under inode->i_rwsem anyway and it wouldn't be needed but
let's not require that.

Signed-off-by: Christian Brauner <brauner@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Song Liu <song@kernel.org>
---
 fs/kernfs/inode.c | 74 +++++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 34 deletions(-)

diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index b83054da68b3..f4b73b9482b7 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,45 +24,46 @@ static const struct inode_operations kernfs_iops = {
 	.listxattr	= kernfs_iop_listxattr,
 };
 
-static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
+static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc)
 {
-	static DEFINE_MUTEX(iattr_mutex);
-	struct kernfs_iattrs *ret;
+	struct kernfs_iattrs *ret __free(kfree) = NULL;
+	struct kernfs_iattrs *attr;
 
-	mutex_lock(&iattr_mutex);
+	attr = READ_ONCE(kn->iattr);
+	if (attr || !alloc)
+		return attr;
 
-	if (kn->iattr || !alloc)
-		goto out_unlock;
-
-	kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
-	if (!kn->iattr)
-		goto out_unlock;
+	ret = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
+	if (!ret)
+		return NULL;
 
 	/* assign default attributes */
-	kn->iattr->ia_uid = GLOBAL_ROOT_UID;
-	kn->iattr->ia_gid = GLOBAL_ROOT_GID;
-
-	ktime_get_real_ts64(&kn->iattr->ia_atime);
-	kn->iattr->ia_mtime = kn->iattr->ia_atime;
-	kn->iattr->ia_ctime = kn->iattr->ia_atime;
-
-	simple_xattrs_init(&kn->iattr->xattrs);
-	atomic_set(&kn->iattr->nr_user_xattrs, 0);
-	atomic_set(&kn->iattr->user_xattr_size, 0);
-out_unlock:
-	ret = kn->iattr;
-	mutex_unlock(&iattr_mutex);
-	return ret;
+	ret->ia_uid = GLOBAL_ROOT_UID;
+	ret->ia_gid = GLOBAL_ROOT_GID;
+
+	ktime_get_real_ts64(&ret->ia_atime);
+	ret->ia_mtime = ret->ia_atime;
+	ret->ia_ctime = ret->ia_atime;
+
+	simple_xattrs_init(&ret->xattrs);
+	atomic_set(&ret->nr_user_xattrs, 0);
+	atomic_set(&ret->user_xattr_size, 0);
+
+	/* If someone raced us, recognize it. */
+	if (!try_cmpxchg(&kn->iattr, &attr, ret))
+		return READ_ONCE(kn->iattr);
+
+	return no_free_ptr(ret);
 }
 
 static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
 {
-	return __kernfs_iattrs(kn, 1);
+	return __kernfs_iattrs(kn, true);
 }
 
 static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn)
 {
-	return __kernfs_iattrs(kn, 0);
+	return __kernfs_iattrs(kn, false);
 }
 
 int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
@@ -141,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
 	struct kernfs_node *kn = kernfs_dentry_node(dentry);
 	struct kernfs_iattrs *attrs;
 
-	attrs = kernfs_iattrs(kn);
+	attrs = kernfs_iattrs_noalloc(kn);
 	if (!attrs)
-		return -ENOMEM;
+		return -ENODATA;
 
 	return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
 }
@@ -166,9 +167,10 @@ static inline void set_inode_attr(struct inode *inode,
 
 static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
 {
-	struct kernfs_iattrs *attrs = kn->iattr;
+	struct kernfs_iattrs *attrs;
 
 	inode->i_mode = kn->mode;
+	attrs = kernfs_iattrs_noalloc(kn);
 	if (attrs)
 		/*
 		 * kernfs_node has non-default attributes get them from
@@ -306,7 +308,9 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
 		     const void *value, size_t size, int flags)
 {
 	struct simple_xattr *old_xattr;
-	struct kernfs_iattrs *attrs = kernfs_iattrs(kn);
+	struct kernfs_iattrs *attrs;
+
+	attrs = kernfs_iattrs(kn);
 	if (!attrs)
 		return -ENOMEM;
 
@@ -345,8 +349,9 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
 				     struct simple_xattrs *xattrs,
 				     const void *value, size_t size, int flags)
 {
-	atomic_t *sz = &kn->iattr->user_xattr_size;
-	atomic_t *nr = &kn->iattr->nr_user_xattrs;
+	struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn);
+	atomic_t *sz = &attr->user_xattr_size;
+	atomic_t *nr = &attr->nr_user_xattrs;
 	struct simple_xattr *old_xattr;
 	int ret;
 
@@ -384,8 +389,9 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
 				    struct simple_xattrs *xattrs,
 				    const void *value, size_t size, int flags)
 {
-	atomic_t *sz = &kn->iattr->user_xattr_size;
-	atomic_t *nr = &kn->iattr->nr_user_xattrs;
+	struct kernfs_iattrs *attr = kernfs_iattrs(kn);
+	atomic_t *sz = &attr->user_xattr_size;
+	atomic_t *nr = &attr->nr_user_xattrs;
 	struct simple_xattr *old_xattr;
 
 	old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
-- 
2.47.1
Re: [PATCH v3 bpf-next 1/4] kernfs: remove iattr_mutex
Posted by André Draszik 3 months, 1 week ago
Hi,

On Sun, 2025-06-22 at 23:38 -0700, Song Liu wrote:
> From: Christian Brauner <brauner@kernel.org>
> 
> All allocations of struct kernfs_iattrs are serialized through a global
> mutex. Simply do a racy allocation and let the first one win. I bet most
> callers are under inode->i_rwsem anyway and it wouldn't be needed but
> let's not require that.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Acked-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Song Liu <song@kernel.org>

On next-20250701, ls -lA gives errors on /sys:

$ ls -lA /sys/
ls: /sys/: No data available
ls: /sys/kernel: No data available
ls: /sys/power: No data available
ls: /sys/class: No data available
ls: /sys/devices: No data available
ls: /sys/dev: No data available
ls: /sys/hypervisor: No data available
ls: /sys/fs: No data available
ls: /sys/bus: No data available
ls: /sys/firmware: No data available
ls: /sys/block: No data available
ls: /sys/module: No data available
total 0
drwxr-xr-x   2 root root 0 Jan  1  1970 block
drwxr-xr-x  52 root root 0 Jan  1  1970 bus
drwxr-xr-x  88 root root 0 Jan  1  1970 class
drwxr-xr-x   4 root root 0 Jan  1  1970 dev
drwxr-xr-x  11 root root 0 Jan  1  1970 devices
drwxr-xr-x   3 root root 0 Jan  1  1970 firmware
drwxr-xr-x  10 root root 0 Jan  1  1970 fs
drwxr-xr-x   2 root root 0 Jul  2 09:43 hypervisor
drwxr-xr-x  14 root root 0 Jan  1  1970 kernel
drwxr-xr-x 251 root root 0 Jan  1  1970 module
drwxr-xr-x   3 root root 0 Jul  2 09:43 power


and my bisect is pointing to this commit. Simply reverting it also fixes
the errors.


Do you have any suggestions?


Cheers,
Andre'
Re: [PATCH v3 bpf-next 1/4] kernfs: remove iattr_mutex
Posted by Christian Brauner 3 months, 1 week ago
On Wed, Jul 02, 2025 at 11:47:58AM +0100, André Draszik wrote:
> Hi,
> 
> On Sun, 2025-06-22 at 23:38 -0700, Song Liu wrote:
> > From: Christian Brauner <brauner@kernel.org>
> > 
> > All allocations of struct kernfs_iattrs are serialized through a global
> > mutex. Simply do a racy allocation and let the first one win. I bet most
> > callers are under inode->i_rwsem anyway and it wouldn't be needed but
> > let's not require that.
> > 
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > Acked-by: Tejun Heo <tj@kernel.org>
> > Signed-off-by: Song Liu <song@kernel.org>
> 
> On next-20250701, ls -lA gives errors on /sys:
> 
> $ ls -lA /sys/
> ls: /sys/: No data available
> ls: /sys/kernel: No data available
> ls: /sys/power: No data available
> ls: /sys/class: No data available
> ls: /sys/devices: No data available
> ls: /sys/dev: No data available
> ls: /sys/hypervisor: No data available
> ls: /sys/fs: No data available
> ls: /sys/bus: No data available
> ls: /sys/firmware: No data available
> ls: /sys/block: No data available
> ls: /sys/module: No data available
> total 0
> drwxr-xr-x   2 root root 0 Jan  1  1970 block
> drwxr-xr-x  52 root root 0 Jan  1  1970 bus
> drwxr-xr-x  88 root root 0 Jan  1  1970 class
> drwxr-xr-x   4 root root 0 Jan  1  1970 dev
> drwxr-xr-x  11 root root 0 Jan  1  1970 devices
> drwxr-xr-x   3 root root 0 Jan  1  1970 firmware
> drwxr-xr-x  10 root root 0 Jan  1  1970 fs
> drwxr-xr-x   2 root root 0 Jul  2 09:43 hypervisor
> drwxr-xr-x  14 root root 0 Jan  1  1970 kernel
> drwxr-xr-x 251 root root 0 Jan  1  1970 module
> drwxr-xr-x   3 root root 0 Jul  2 09:43 power
> 
> 
> and my bisect is pointing to this commit. Simply reverting it also fixes
> the errors.
> 
> 
> Do you have any suggestions?

Yes, apparently the xattr selftest don't cover sysfs/kernfs. The issue
is that the commit changed listxattr() to skip allocation of the xattr
header and instead just returned ENODATA. We should just allocate like
before tested just now:

user1@localhost:~$ sudo ls -al /sys/kernel/
total 0
drwxr-xr-x  17 root root    0 Jul  2 13:41 .
dr-xr-xr-x  12 root root    0 Jul  2 13:41 ..
-r--r--r--   1 root root 4096 Jul  2 13:41 address_bits
drwxr-xr-x   3 root root    0 Jul  2 13:41 boot_params
drwxr-xr-x   2 root root    0 Jul  2 13:41 btf
drwxr-xr-x   2 root root    0 Jul  2 13:41 cgroup
drwxr-xr-x   2 root root    0 Jul  2 13:41 config
-r--r--r--   1 root root 4096 Jul  2 13:41 cpu_byteorder
-r--r--r--   1 root root 4096 Jul  2 13:41 crash_elfcorehdr_size
drwx------  34 root root    0 Jul  2 13:41 debug
-r--r--r--   1 root root 4096 Jul  2 13:41 fscaps
-r--r--r--   1 root root 4096 Jul  2 13:41 hardlockup_count
drwxr-xr-x   2 root root    0 Jul  2 13:41 iommu_groups
drwxr-xr-x 344 root root    0 Jul  2 13:41 irq
-r--r--r--   1 root root 4096 Jul  2 13:41 kexec_crash_loaded
-rw-r--r--   1 root root 4096 Jul  2 13:41 kexec_crash_size
-r--r--r--   1 root root 4096 Jul  2 13:41 kexec_loaded
drwxr-xr-x   9 root root    0 Jul  2 13:41 mm
-r--r--r--   1 root root   84 Jul  2 13:41 notes
-r--r--r--   1 root root 4096 Jul  2 13:41 oops_count
-rw-r--r--   1 root root 4096 Jul  2 13:41 profiling
-rw-r--r--   1 root root 4096 Jul  2 13:41 rcu_expedited
-rw-r--r--   1 root root 4096 Jul  2 13:41 rcu_normal
-r--r--r--   1 root root 4096 Jul  2 13:41 rcu_stall_count
drwxr-xr-x   2 root root    0 Jul  2 13:41 reboot
drwxr-xr-x   2 root root    0 Jul  2 13:41 sched_ext
drwxr-xr-x   4 root root    0 Jul  2 13:41 security
drwxr-xr-x 190 root root    0 Jul  2 13:41 slab
-r--r--r--   1 root root 4096 Jul  2 13:41 softlockup_count
drwxr-xr-x   2 root root    0 Jul  2 13:41 software_nodes
drwxr-xr-x   4 root root    0 Jul  2 13:41 sunrpc
drwxr-xr-x   6 root root    0 Jul  2 13:41 tracing
-r--r--r--   1 root root 4096 Jul  2 13:41 uevent_seqnum
-r--r--r--   1 root root 4096 Jul  2 13:41 vmcoreinfo
-r--r--r--   1 root root 4096 Jul  2 13:41 warn_count

I'm folding:

diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3c293a5a21b1..457f91c412d4 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -142,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
        struct kernfs_node *kn = kernfs_dentry_node(dentry);
        struct kernfs_iattrs *attrs;

-       attrs = kernfs_iattrs_noalloc(kn);
+       attrs = kernfs_iattrs(kn);
        if (!attrs)
-               return -ENODATA;
+               return -ENOMEM;

        return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
 }

which brings it back to the old behavior.

I'm also adding a selftest for this behavior. Patch appended.
Re: [PATCH v3 bpf-next 1/4] kernfs: remove iattr_mutex
Posted by Jan Kiszka 1 month, 3 weeks ago
On 02.07.25 14:17, Christian Brauner wrote:
> On Wed, Jul 02, 2025 at 11:47:58AM +0100, André Draszik wrote:
>> Hi,
>>
>> On Sun, 2025-06-22 at 23:38 -0700, Song Liu wrote:
>>> From: Christian Brauner <brauner@kernel.org>
>>>
>>> All allocations of struct kernfs_iattrs are serialized through a global
>>> mutex. Simply do a racy allocation and let the first one win. I bet most
>>> callers are under inode->i_rwsem anyway and it wouldn't be needed but
>>> let's not require that.
>>>
>>> Signed-off-by: Christian Brauner <brauner@kernel.org>
>>> Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>>> Acked-by: Tejun Heo <tj@kernel.org>
>>> Signed-off-by: Song Liu <song@kernel.org>
>>
>> On next-20250701, ls -lA gives errors on /sys:
>>
>> $ ls -lA /sys/
>> ls: /sys/: No data available
>> ls: /sys/kernel: No data available
>> ls: /sys/power: No data available
>> ls: /sys/class: No data available
>> ls: /sys/devices: No data available
>> ls: /sys/dev: No data available
>> ls: /sys/hypervisor: No data available
>> ls: /sys/fs: No data available
>> ls: /sys/bus: No data available
>> ls: /sys/firmware: No data available
>> ls: /sys/block: No data available
>> ls: /sys/module: No data available
>> total 0
>> drwxr-xr-x   2 root root 0 Jan  1  1970 block
>> drwxr-xr-x  52 root root 0 Jan  1  1970 bus
>> drwxr-xr-x  88 root root 0 Jan  1  1970 class
>> drwxr-xr-x   4 root root 0 Jan  1  1970 dev
>> drwxr-xr-x  11 root root 0 Jan  1  1970 devices
>> drwxr-xr-x   3 root root 0 Jan  1  1970 firmware
>> drwxr-xr-x  10 root root 0 Jan  1  1970 fs
>> drwxr-xr-x   2 root root 0 Jul  2 09:43 hypervisor
>> drwxr-xr-x  14 root root 0 Jan  1  1970 kernel
>> drwxr-xr-x 251 root root 0 Jan  1  1970 module
>> drwxr-xr-x   3 root root 0 Jul  2 09:43 power
>>
>>
>> and my bisect is pointing to this commit. Simply reverting it also fixes
>> the errors.
>>
>>
>> Do you have any suggestions?
> 
> Yes, apparently the xattr selftest don't cover sysfs/kernfs. The issue
> is that the commit changed listxattr() to skip allocation of the xattr
> header and instead just returned ENODATA. We should just allocate like
> before tested just now:
> 
> user1@localhost:~$ sudo ls -al /sys/kernel/
> total 0
> drwxr-xr-x  17 root root    0 Jul  2 13:41 .
> dr-xr-xr-x  12 root root    0 Jul  2 13:41 ..
> -r--r--r--   1 root root 4096 Jul  2 13:41 address_bits
> drwxr-xr-x   3 root root    0 Jul  2 13:41 boot_params
> drwxr-xr-x   2 root root    0 Jul  2 13:41 btf
> drwxr-xr-x   2 root root    0 Jul  2 13:41 cgroup
> drwxr-xr-x   2 root root    0 Jul  2 13:41 config
> -r--r--r--   1 root root 4096 Jul  2 13:41 cpu_byteorder
> -r--r--r--   1 root root 4096 Jul  2 13:41 crash_elfcorehdr_size
> drwx------  34 root root    0 Jul  2 13:41 debug
> -r--r--r--   1 root root 4096 Jul  2 13:41 fscaps
> -r--r--r--   1 root root 4096 Jul  2 13:41 hardlockup_count
> drwxr-xr-x   2 root root    0 Jul  2 13:41 iommu_groups
> drwxr-xr-x 344 root root    0 Jul  2 13:41 irq
> -r--r--r--   1 root root 4096 Jul  2 13:41 kexec_crash_loaded
> -rw-r--r--   1 root root 4096 Jul  2 13:41 kexec_crash_size
> -r--r--r--   1 root root 4096 Jul  2 13:41 kexec_loaded
> drwxr-xr-x   9 root root    0 Jul  2 13:41 mm
> -r--r--r--   1 root root   84 Jul  2 13:41 notes
> -r--r--r--   1 root root 4096 Jul  2 13:41 oops_count
> -rw-r--r--   1 root root 4096 Jul  2 13:41 profiling
> -rw-r--r--   1 root root 4096 Jul  2 13:41 rcu_expedited
> -rw-r--r--   1 root root 4096 Jul  2 13:41 rcu_normal
> -r--r--r--   1 root root 4096 Jul  2 13:41 rcu_stall_count
> drwxr-xr-x   2 root root    0 Jul  2 13:41 reboot
> drwxr-xr-x   2 root root    0 Jul  2 13:41 sched_ext
> drwxr-xr-x   4 root root    0 Jul  2 13:41 security
> drwxr-xr-x 190 root root    0 Jul  2 13:41 slab
> -r--r--r--   1 root root 4096 Jul  2 13:41 softlockup_count
> drwxr-xr-x   2 root root    0 Jul  2 13:41 software_nodes
> drwxr-xr-x   4 root root    0 Jul  2 13:41 sunrpc
> drwxr-xr-x   6 root root    0 Jul  2 13:41 tracing
> -r--r--r--   1 root root 4096 Jul  2 13:41 uevent_seqnum
> -r--r--r--   1 root root 4096 Jul  2 13:41 vmcoreinfo
> -r--r--r--   1 root root 4096 Jul  2 13:41 warn_count
> 
> I'm folding:
> 
> diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
> index 3c293a5a21b1..457f91c412d4 100644
> --- a/fs/kernfs/inode.c
> +++ b/fs/kernfs/inode.c
> @@ -142,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
>         struct kernfs_node *kn = kernfs_dentry_node(dentry);
>         struct kernfs_iattrs *attrs;
> 
> -       attrs = kernfs_iattrs_noalloc(kn);
> +       attrs = kernfs_iattrs(kn);
>         if (!attrs)
> -               return -ENODATA;
> +               return -ENOMEM;
> 
>         return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
>  }
> 
> which brings it back to the old behavior.
> 

...but it looks like v3 was merged as-is in the end, without this fixup.
Is there some separate patch in the pipeline, or was this forgotten?

> I'm also adding a selftest for this behavior. Patch appended.

Jan

-- 
Siemens AG, Foundational Technologies
Linux Expert Center
Re: [PATCH v3 bpf-next 1/4] kernfs: remove iattr_mutex
Posted by Christian Brauner 1 month, 2 weeks ago
> ...but it looks like v3 was merged as-is in the end, without this fixup.
> Is there some separate patch in the pipeline, or was this forgotten?

This is a result of the trees diverging which we discussed earlier.
I sent a fix.
Re: [PATCH v3 bpf-next 1/4] kernfs: remove iattr_mutex
Posted by André Draszik 3 months, 1 week ago
On Wed, 2025-07-02 at 14:17 +0200, Christian Brauner wrote:
> I'm folding:
> 
> diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
> index 3c293a5a21b1..457f91c412d4 100644
> --- a/fs/kernfs/inode.c
> +++ b/fs/kernfs/inode.c
> @@ -142,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
>         struct kernfs_node *kn = kernfs_dentry_node(dentry);
>         struct kernfs_iattrs *attrs;
> 
> -       attrs = kernfs_iattrs_noalloc(kn);
> +       attrs = kernfs_iattrs(kn);
>         if (!attrs)
> -               return -ENODATA;
> +               return -ENOMEM;
> 
>         return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
>  }
> 
> which brings it back to the old behavior.

Yes, that makes sense and works for me too.

Thanks Christian!