From: Christian Brauner <brauner@kernel.org>
All allocations of struct kernfs_iattrs are serialized through a global
mutex. Simply do a racy allocation and let the first one win. I bet most
callers are under inode->i_rwsem anyway and it wouldn't be needed but
let's not require that.
Signed-off-by: Christian Brauner <brauner@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Song Liu <song@kernel.org>
---
fs/kernfs/inode.c | 74 +++++++++++++++++++++++++----------------------
1 file changed, 40 insertions(+), 34 deletions(-)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index b83054da68b3..f4b73b9482b7 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,45 +24,46 @@ static const struct inode_operations kernfs_iops = {
.listxattr = kernfs_iop_listxattr,
};
-static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
+static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc)
{
- static DEFINE_MUTEX(iattr_mutex);
- struct kernfs_iattrs *ret;
+ struct kernfs_iattrs *ret __free(kfree) = NULL;
+ struct kernfs_iattrs *attr;
- mutex_lock(&iattr_mutex);
+ attr = READ_ONCE(kn->iattr);
+ if (attr || !alloc)
+ return attr;
- if (kn->iattr || !alloc)
- goto out_unlock;
-
- kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
- if (!kn->iattr)
- goto out_unlock;
+ ret = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
+ if (!ret)
+ return NULL;
/* assign default attributes */
- kn->iattr->ia_uid = GLOBAL_ROOT_UID;
- kn->iattr->ia_gid = GLOBAL_ROOT_GID;
-
- ktime_get_real_ts64(&kn->iattr->ia_atime);
- kn->iattr->ia_mtime = kn->iattr->ia_atime;
- kn->iattr->ia_ctime = kn->iattr->ia_atime;
-
- simple_xattrs_init(&kn->iattr->xattrs);
- atomic_set(&kn->iattr->nr_user_xattrs, 0);
- atomic_set(&kn->iattr->user_xattr_size, 0);
-out_unlock:
- ret = kn->iattr;
- mutex_unlock(&iattr_mutex);
- return ret;
+ ret->ia_uid = GLOBAL_ROOT_UID;
+ ret->ia_gid = GLOBAL_ROOT_GID;
+
+ ktime_get_real_ts64(&ret->ia_atime);
+ ret->ia_mtime = ret->ia_atime;
+ ret->ia_ctime = ret->ia_atime;
+
+ simple_xattrs_init(&ret->xattrs);
+ atomic_set(&ret->nr_user_xattrs, 0);
+ atomic_set(&ret->user_xattr_size, 0);
+
+ /* If someone raced us, recognize it. */
+ if (!try_cmpxchg(&kn->iattr, &attr, ret))
+ return READ_ONCE(kn->iattr);
+
+ return no_free_ptr(ret);
}
static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
{
- return __kernfs_iattrs(kn, 1);
+ return __kernfs_iattrs(kn, true);
}
static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn)
{
- return __kernfs_iattrs(kn, 0);
+ return __kernfs_iattrs(kn, false);
}
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
@@ -141,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
struct kernfs_node *kn = kernfs_dentry_node(dentry);
struct kernfs_iattrs *attrs;
- attrs = kernfs_iattrs(kn);
+ attrs = kernfs_iattrs_noalloc(kn);
if (!attrs)
- return -ENOMEM;
+ return -ENODATA;
return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
}
@@ -166,9 +167,10 @@ static inline void set_inode_attr(struct inode *inode,
static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
{
- struct kernfs_iattrs *attrs = kn->iattr;
+ struct kernfs_iattrs *attrs;
inode->i_mode = kn->mode;
+ attrs = kernfs_iattrs_noalloc(kn);
if (attrs)
/*
* kernfs_node has non-default attributes get them from
@@ -306,7 +308,9 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
const void *value, size_t size, int flags)
{
struct simple_xattr *old_xattr;
- struct kernfs_iattrs *attrs = kernfs_iattrs(kn);
+ struct kernfs_iattrs *attrs;
+
+ attrs = kernfs_iattrs(kn);
if (!attrs)
return -ENOMEM;
@@ -345,8 +349,9 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
struct simple_xattrs *xattrs,
const void *value, size_t size, int flags)
{
- atomic_t *sz = &kn->iattr->user_xattr_size;
- atomic_t *nr = &kn->iattr->nr_user_xattrs;
+ struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn);
+ atomic_t *sz = &attr->user_xattr_size;
+ atomic_t *nr = &attr->nr_user_xattrs;
struct simple_xattr *old_xattr;
int ret;
@@ -384,8 +389,9 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
struct simple_xattrs *xattrs,
const void *value, size_t size, int flags)
{
- atomic_t *sz = &kn->iattr->user_xattr_size;
- atomic_t *nr = &kn->iattr->nr_user_xattrs;
+ struct kernfs_iattrs *attr = kernfs_iattrs(kn);
+ atomic_t *sz = &attr->user_xattr_size;
+ atomic_t *nr = &attr->nr_user_xattrs;
struct simple_xattr *old_xattr;
old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
--
2.47.1
Hi, On Sun, 2025-06-22 at 23:38 -0700, Song Liu wrote: > From: Christian Brauner <brauner@kernel.org> > > All allocations of struct kernfs_iattrs are serialized through a global > mutex. Simply do a racy allocation and let the first one win. I bet most > callers are under inode->i_rwsem anyway and it wouldn't be needed but > let's not require that. > > Signed-off-by: Christian Brauner <brauner@kernel.org> > Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> > Acked-by: Tejun Heo <tj@kernel.org> > Signed-off-by: Song Liu <song@kernel.org> On next-20250701, ls -lA gives errors on /sys: $ ls -lA /sys/ ls: /sys/: No data available ls: /sys/kernel: No data available ls: /sys/power: No data available ls: /sys/class: No data available ls: /sys/devices: No data available ls: /sys/dev: No data available ls: /sys/hypervisor: No data available ls: /sys/fs: No data available ls: /sys/bus: No data available ls: /sys/firmware: No data available ls: /sys/block: No data available ls: /sys/module: No data available total 0 drwxr-xr-x 2 root root 0 Jan 1 1970 block drwxr-xr-x 52 root root 0 Jan 1 1970 bus drwxr-xr-x 88 root root 0 Jan 1 1970 class drwxr-xr-x 4 root root 0 Jan 1 1970 dev drwxr-xr-x 11 root root 0 Jan 1 1970 devices drwxr-xr-x 3 root root 0 Jan 1 1970 firmware drwxr-xr-x 10 root root 0 Jan 1 1970 fs drwxr-xr-x 2 root root 0 Jul 2 09:43 hypervisor drwxr-xr-x 14 root root 0 Jan 1 1970 kernel drwxr-xr-x 251 root root 0 Jan 1 1970 module drwxr-xr-x 3 root root 0 Jul 2 09:43 power and my bisect is pointing to this commit. Simply reverting it also fixes the errors. Do you have any suggestions? Cheers, Andre'
On Wed, Jul 02, 2025 at 11:47:58AM +0100, André Draszik wrote: > Hi, > > On Sun, 2025-06-22 at 23:38 -0700, Song Liu wrote: > > From: Christian Brauner <brauner@kernel.org> > > > > All allocations of struct kernfs_iattrs are serialized through a global > > mutex. Simply do a racy allocation and let the first one win. I bet most > > callers are under inode->i_rwsem anyway and it wouldn't be needed but > > let's not require that. > > > > Signed-off-by: Christian Brauner <brauner@kernel.org> > > Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> > > Acked-by: Tejun Heo <tj@kernel.org> > > Signed-off-by: Song Liu <song@kernel.org> > > On next-20250701, ls -lA gives errors on /sys: > > $ ls -lA /sys/ > ls: /sys/: No data available > ls: /sys/kernel: No data available > ls: /sys/power: No data available > ls: /sys/class: No data available > ls: /sys/devices: No data available > ls: /sys/dev: No data available > ls: /sys/hypervisor: No data available > ls: /sys/fs: No data available > ls: /sys/bus: No data available > ls: /sys/firmware: No data available > ls: /sys/block: No data available > ls: /sys/module: No data available > total 0 > drwxr-xr-x 2 root root 0 Jan 1 1970 block > drwxr-xr-x 52 root root 0 Jan 1 1970 bus > drwxr-xr-x 88 root root 0 Jan 1 1970 class > drwxr-xr-x 4 root root 0 Jan 1 1970 dev > drwxr-xr-x 11 root root 0 Jan 1 1970 devices > drwxr-xr-x 3 root root 0 Jan 1 1970 firmware > drwxr-xr-x 10 root root 0 Jan 1 1970 fs > drwxr-xr-x 2 root root 0 Jul 2 09:43 hypervisor > drwxr-xr-x 14 root root 0 Jan 1 1970 kernel > drwxr-xr-x 251 root root 0 Jan 1 1970 module > drwxr-xr-x 3 root root 0 Jul 2 09:43 power > > > and my bisect is pointing to this commit. Simply reverting it also fixes > the errors. > > > Do you have any suggestions? Yes, apparently the xattr selftest don't cover sysfs/kernfs. The issue is that the commit changed listxattr() to skip allocation of the xattr header and instead just returned ENODATA. We should just allocate like before tested just now: user1@localhost:~$ sudo ls -al /sys/kernel/ total 0 drwxr-xr-x 17 root root 0 Jul 2 13:41 . dr-xr-xr-x 12 root root 0 Jul 2 13:41 .. -r--r--r-- 1 root root 4096 Jul 2 13:41 address_bits drwxr-xr-x 3 root root 0 Jul 2 13:41 boot_params drwxr-xr-x 2 root root 0 Jul 2 13:41 btf drwxr-xr-x 2 root root 0 Jul 2 13:41 cgroup drwxr-xr-x 2 root root 0 Jul 2 13:41 config -r--r--r-- 1 root root 4096 Jul 2 13:41 cpu_byteorder -r--r--r-- 1 root root 4096 Jul 2 13:41 crash_elfcorehdr_size drwx------ 34 root root 0 Jul 2 13:41 debug -r--r--r-- 1 root root 4096 Jul 2 13:41 fscaps -r--r--r-- 1 root root 4096 Jul 2 13:41 hardlockup_count drwxr-xr-x 2 root root 0 Jul 2 13:41 iommu_groups drwxr-xr-x 344 root root 0 Jul 2 13:41 irq -r--r--r-- 1 root root 4096 Jul 2 13:41 kexec_crash_loaded -rw-r--r-- 1 root root 4096 Jul 2 13:41 kexec_crash_size -r--r--r-- 1 root root 4096 Jul 2 13:41 kexec_loaded drwxr-xr-x 9 root root 0 Jul 2 13:41 mm -r--r--r-- 1 root root 84 Jul 2 13:41 notes -r--r--r-- 1 root root 4096 Jul 2 13:41 oops_count -rw-r--r-- 1 root root 4096 Jul 2 13:41 profiling -rw-r--r-- 1 root root 4096 Jul 2 13:41 rcu_expedited -rw-r--r-- 1 root root 4096 Jul 2 13:41 rcu_normal -r--r--r-- 1 root root 4096 Jul 2 13:41 rcu_stall_count drwxr-xr-x 2 root root 0 Jul 2 13:41 reboot drwxr-xr-x 2 root root 0 Jul 2 13:41 sched_ext drwxr-xr-x 4 root root 0 Jul 2 13:41 security drwxr-xr-x 190 root root 0 Jul 2 13:41 slab -r--r--r-- 1 root root 4096 Jul 2 13:41 softlockup_count drwxr-xr-x 2 root root 0 Jul 2 13:41 software_nodes drwxr-xr-x 4 root root 0 Jul 2 13:41 sunrpc drwxr-xr-x 6 root root 0 Jul 2 13:41 tracing -r--r--r-- 1 root root 4096 Jul 2 13:41 uevent_seqnum -r--r--r-- 1 root root 4096 Jul 2 13:41 vmcoreinfo -r--r--r-- 1 root root 4096 Jul 2 13:41 warn_count I'm folding: diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 3c293a5a21b1..457f91c412d4 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -142,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; - attrs = kernfs_iattrs_noalloc(kn); + attrs = kernfs_iattrs(kn); if (!attrs) - return -ENODATA; + return -ENOMEM; return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } which brings it back to the old behavior. I'm also adding a selftest for this behavior. Patch appended.
On 02.07.25 14:17, Christian Brauner wrote: > On Wed, Jul 02, 2025 at 11:47:58AM +0100, André Draszik wrote: >> Hi, >> >> On Sun, 2025-06-22 at 23:38 -0700, Song Liu wrote: >>> From: Christian Brauner <brauner@kernel.org> >>> >>> All allocations of struct kernfs_iattrs are serialized through a global >>> mutex. Simply do a racy allocation and let the first one win. I bet most >>> callers are under inode->i_rwsem anyway and it wouldn't be needed but >>> let's not require that. >>> >>> Signed-off-by: Christian Brauner <brauner@kernel.org> >>> Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> >>> Acked-by: Tejun Heo <tj@kernel.org> >>> Signed-off-by: Song Liu <song@kernel.org> >> >> On next-20250701, ls -lA gives errors on /sys: >> >> $ ls -lA /sys/ >> ls: /sys/: No data available >> ls: /sys/kernel: No data available >> ls: /sys/power: No data available >> ls: /sys/class: No data available >> ls: /sys/devices: No data available >> ls: /sys/dev: No data available >> ls: /sys/hypervisor: No data available >> ls: /sys/fs: No data available >> ls: /sys/bus: No data available >> ls: /sys/firmware: No data available >> ls: /sys/block: No data available >> ls: /sys/module: No data available >> total 0 >> drwxr-xr-x 2 root root 0 Jan 1 1970 block >> drwxr-xr-x 52 root root 0 Jan 1 1970 bus >> drwxr-xr-x 88 root root 0 Jan 1 1970 class >> drwxr-xr-x 4 root root 0 Jan 1 1970 dev >> drwxr-xr-x 11 root root 0 Jan 1 1970 devices >> drwxr-xr-x 3 root root 0 Jan 1 1970 firmware >> drwxr-xr-x 10 root root 0 Jan 1 1970 fs >> drwxr-xr-x 2 root root 0 Jul 2 09:43 hypervisor >> drwxr-xr-x 14 root root 0 Jan 1 1970 kernel >> drwxr-xr-x 251 root root 0 Jan 1 1970 module >> drwxr-xr-x 3 root root 0 Jul 2 09:43 power >> >> >> and my bisect is pointing to this commit. Simply reverting it also fixes >> the errors. >> >> >> Do you have any suggestions? > > Yes, apparently the xattr selftest don't cover sysfs/kernfs. The issue > is that the commit changed listxattr() to skip allocation of the xattr > header and instead just returned ENODATA. We should just allocate like > before tested just now: > > user1@localhost:~$ sudo ls -al /sys/kernel/ > total 0 > drwxr-xr-x 17 root root 0 Jul 2 13:41 . > dr-xr-xr-x 12 root root 0 Jul 2 13:41 .. > -r--r--r-- 1 root root 4096 Jul 2 13:41 address_bits > drwxr-xr-x 3 root root 0 Jul 2 13:41 boot_params > drwxr-xr-x 2 root root 0 Jul 2 13:41 btf > drwxr-xr-x 2 root root 0 Jul 2 13:41 cgroup > drwxr-xr-x 2 root root 0 Jul 2 13:41 config > -r--r--r-- 1 root root 4096 Jul 2 13:41 cpu_byteorder > -r--r--r-- 1 root root 4096 Jul 2 13:41 crash_elfcorehdr_size > drwx------ 34 root root 0 Jul 2 13:41 debug > -r--r--r-- 1 root root 4096 Jul 2 13:41 fscaps > -r--r--r-- 1 root root 4096 Jul 2 13:41 hardlockup_count > drwxr-xr-x 2 root root 0 Jul 2 13:41 iommu_groups > drwxr-xr-x 344 root root 0 Jul 2 13:41 irq > -r--r--r-- 1 root root 4096 Jul 2 13:41 kexec_crash_loaded > -rw-r--r-- 1 root root 4096 Jul 2 13:41 kexec_crash_size > -r--r--r-- 1 root root 4096 Jul 2 13:41 kexec_loaded > drwxr-xr-x 9 root root 0 Jul 2 13:41 mm > -r--r--r-- 1 root root 84 Jul 2 13:41 notes > -r--r--r-- 1 root root 4096 Jul 2 13:41 oops_count > -rw-r--r-- 1 root root 4096 Jul 2 13:41 profiling > -rw-r--r-- 1 root root 4096 Jul 2 13:41 rcu_expedited > -rw-r--r-- 1 root root 4096 Jul 2 13:41 rcu_normal > -r--r--r-- 1 root root 4096 Jul 2 13:41 rcu_stall_count > drwxr-xr-x 2 root root 0 Jul 2 13:41 reboot > drwxr-xr-x 2 root root 0 Jul 2 13:41 sched_ext > drwxr-xr-x 4 root root 0 Jul 2 13:41 security > drwxr-xr-x 190 root root 0 Jul 2 13:41 slab > -r--r--r-- 1 root root 4096 Jul 2 13:41 softlockup_count > drwxr-xr-x 2 root root 0 Jul 2 13:41 software_nodes > drwxr-xr-x 4 root root 0 Jul 2 13:41 sunrpc > drwxr-xr-x 6 root root 0 Jul 2 13:41 tracing > -r--r--r-- 1 root root 4096 Jul 2 13:41 uevent_seqnum > -r--r--r-- 1 root root 4096 Jul 2 13:41 vmcoreinfo > -r--r--r-- 1 root root 4096 Jul 2 13:41 warn_count > > I'm folding: > > diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c > index 3c293a5a21b1..457f91c412d4 100644 > --- a/fs/kernfs/inode.c > +++ b/fs/kernfs/inode.c > @@ -142,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) > struct kernfs_node *kn = kernfs_dentry_node(dentry); > struct kernfs_iattrs *attrs; > > - attrs = kernfs_iattrs_noalloc(kn); > + attrs = kernfs_iattrs(kn); > if (!attrs) > - return -ENODATA; > + return -ENOMEM; > > return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); > } > > which brings it back to the old behavior. > ...but it looks like v3 was merged as-is in the end, without this fixup. Is there some separate patch in the pipeline, or was this forgotten? > I'm also adding a selftest for this behavior. Patch appended. Jan -- Siemens AG, Foundational Technologies Linux Expert Center
> ...but it looks like v3 was merged as-is in the end, without this fixup. > Is there some separate patch in the pipeline, or was this forgotten? This is a result of the trees diverging which we discussed earlier. I sent a fix.
On Wed, 2025-07-02 at 14:17 +0200, Christian Brauner wrote: > I'm folding: > > diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c > index 3c293a5a21b1..457f91c412d4 100644 > --- a/fs/kernfs/inode.c > +++ b/fs/kernfs/inode.c > @@ -142,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) > struct kernfs_node *kn = kernfs_dentry_node(dentry); > struct kernfs_iattrs *attrs; > > - attrs = kernfs_iattrs_noalloc(kn); > + attrs = kernfs_iattrs(kn); > if (!attrs) > - return -ENODATA; > + return -ENOMEM; > > return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); > } > > which brings it back to the old behavior. Yes, that makes sense and works for me too. Thanks Christian!
© 2016 - 2025 Red Hat, Inc.