fs/proc/inode.c | 2 +- fs/proc/proc_sysctl.c | 35 ++++------------------------------- 2 files changed, 5 insertions(+), 32 deletions(-)
Some sysctl tables can provide an is_seen() function which reports if
the sysctl should be visible to the current process. This is currently
used to cause d_compare to fail for invisible sysctls.
This technique might have worked in 2.6.26 when it was implemented, but
it cannot work now. In particular if ->d_compare always fails for a
particular name, then d_alloc_parallel() will always create a new dentry
and pass it to lookup() resulting in a new inode for every lookup. I
tested this by changing sysctl_is_seen() to always return 0. When
all sysctls were still visible and repeated lookups (ls -li) reported
different inode numbers.
This patch discards proc_sys_compare (the d_compare function) and
instead adds the is_seen functionality into proc_sys_revalidate (the
d_revalidate function). If the sysctl table is unregistering, 0 is
returned. Otherwise if is_seen() exists but fails, -ENOENT is returned.
The rcu_dereference() and RCU_INIT_POINTER() for ->sysctl are removed as
the field is not rcu-managed. It is only changed when the inode is
created and when it is evicted, and in these cases there can be no
possible concurrent access.
Signed-off-by: NeilBrown <neil@brown.name>
---
fs/proc/inode.c | 2 +-
fs/proc/proc_sysctl.c | 35 ++++-------------------------------
2 files changed, 5 insertions(+), 32 deletions(-)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index a3eb3b740f76..c3991dd314d9 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -42,7 +42,7 @@ static void proc_evict_inode(struct inode *inode)
head = ei->sysctl;
if (head) {
- RCU_INIT_POINTER(ei->sysctl, NULL);
+ ei->sysctl = NULL;
proc_sys_evict_inode(inode, head);
}
}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index cc9d74a06ff0..dbf652b50909 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -884,21 +884,15 @@ static const struct inode_operations proc_sys_dir_operations = {
.getattr = proc_sys_getattr,
};
-static int proc_sys_revalidate(struct inode *dir, const struct qstr *name,
- struct dentry *dentry, unsigned int flags)
-{
- if (flags & LOOKUP_RCU)
- return -ECHILD;
- return !PROC_I(d_inode(dentry))->sysctl->unregistering;
-}
-
static int proc_sys_delete(const struct dentry *dentry)
{
return !!PROC_I(d_inode(dentry))->sysctl->unregistering;
}
-static int sysctl_is_seen(struct ctl_table_header *p)
+static int proc_sys_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
+ struct ctl_table_header *p = PROC_I(d_inode(dentry))->sysctl;
struct ctl_table_set *set = p->set;
int res;
spin_lock(&sysctl_lock);
@@ -907,35 +901,14 @@ static int sysctl_is_seen(struct ctl_table_header *p)
else if (!set->is_seen)
res = 1;
else
- res = set->is_seen(set);
+ res = set->is_seen(set) ? 1 : -ENOENT;
spin_unlock(&sysctl_lock);
return res;
}
-static int proc_sys_compare(const struct dentry *dentry,
- unsigned int len, const char *str, const struct qstr *name)
-{
- struct ctl_table_header *head;
- struct inode *inode;
-
- /* Although proc doesn't have negative dentries, rcu-walk means
- * that inode here can be NULL */
- /* AV: can it, indeed? */
- inode = d_inode_rcu(dentry);
- if (!inode)
- return 1;
- if (name->len != len)
- return 1;
- if (memcmp(name->name, str, len))
- return 1;
- head = rcu_dereference(PROC_I(inode)->sysctl);
- return !head || !sysctl_is_seen(head);
-}
-
static const struct dentry_operations proc_sys_dentry_operations = {
.d_revalidate = proc_sys_revalidate,
.d_delete = proc_sys_delete,
- .d_compare = proc_sys_compare,
};
static struct ctl_dir *find_subdir(struct ctl_dir *dir,
--
2.49.0
Hello, kernel test robot noticed "segfault_at_ip_sp_error" on: commit: 981dfb28e2d5851435754da557baadfdcbc828a8 ("[PATCH] proc_sysctl: Fix up ->is_seen() handling") url: https://github.com/intel-lab-lkp/linux/commits/NeilBrown/proc_sysctl-Fix-up-is_seen-handling/20250613-083900 base: https://git.kernel.org/cgit/linux/kernel/git/sysctl/sysctl.git sysctl-next patch link: https://lore.kernel.org/all/174977507817.608730.3467596162021780258@noble.neil.brown.name/ patch subject: [PATCH] proc_sysctl: Fix up ->is_seen() handling in testcase: ltp version: ltp-x86_64-99ebf35b3-1_20250607 with following parameters: test: net.tirpc_tests config: x86_64-rhel-9.4-ltp compiler: gcc-12 test machine: 8 threads 1 sockets Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz (Ivy Bridge) with 16G memory (please refer to attached dmesg/kmsg for entire log/backtrace) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <oliver.sang@intel.com> | Closes: https://lore.kernel.org/oe-lkp/202506161619.6738c86c-lkp@intel.com kern :info : [ 157.002007] tirpc_clnt_cont[10507]: segfault at 8 ip 000055c92e030119 sp 00007ffd50302fb0 error 4 in tirpc_clnt_control[1119,55c92e030000+1000] likely on CPU 2 (core 2, socket 0) kern :info : [ 157.002435] Code: 1d 0f 00 00 31 db 48 89 c1 4c 8d 44 24 10 ba 01 00 00 00 0f 29 44 24 10 e8 44 ff ff ff 48 8d 54 24 0c be 06 00 00 00 48 89 c7 <48> 8b 40 08 ff 50 28 48 8d 35 e3 0e 00 00 bf 01 00 00 00 83 f8 01 All code ======== 0: 1d 0f 00 00 31 sbb $0x3100000f,%eax 5: db 48 89 fisttpl -0x77(%rax) 8: c1 4c 8d 44 24 rorl $0x24,0x44(%rbp,%rcx,4) d: 10 ba 01 00 00 00 adc %bh,0x1(%rdx) 13: 0f 29 44 24 10 movaps %xmm0,0x10(%rsp) 18: e8 44 ff ff ff call 0xffffffffffffff61 1d: 48 8d 54 24 0c lea 0xc(%rsp),%rdx 22: be 06 00 00 00 mov $0x6,%esi 27: 48 89 c7 mov %rax,%rdi 2a:* 48 8b 40 08 mov 0x8(%rax),%rax <-- trapping instruction 2e: ff 50 28 call *0x28(%rax) 31: 48 8d 35 e3 0e 00 00 lea 0xee3(%rip),%rsi # 0xf1b 38: bf 01 00 00 00 mov $0x1,%edi 3d: 83 f8 01 cmp $0x1,%eax Code starting with the faulting instruction =========================================== 0: 48 8b 40 08 mov 0x8(%rax),%rax 4: ff 50 28 call *0x28(%rax) 7: 48 8d 35 e3 0e 00 00 lea 0xee3(%rip),%rsi # 0xef1 e: bf 01 00 00 00 mov $0x1,%edi 13: 83 f8 01 cmp $0x1,%eax user :warn : [ 157.168019] LTP: starting tirpc_rpc_reg (rpc_test.sh -c tirpc_rpc_reg) user :warn : [ 158.080633] LTP: starting tirpc_rpc_call (rpc_test.sh -s tirpc_svc_1 -c tirpc_rpc_call) user :warn : [ 159.209909] LTP: starting tirpc_rpc_broadcast (rpc_test.sh -s tirpc_svc_1 -c tirpc_rpc_broadcast) user :warn : [ 160.241552] LTP: starting tirpc_rpc_broadcast_exp (rpc_test.sh -s tirpc_svc_1 -c tirpc_rpc_broadcast_exp -e "1,2") user :warn : [ 161.182028] LTP: starting tirpc_clnt_create (rpc_test.sh -s tirpc_svc_2 -c tirpc_clnt_create) user :warn : [ 162.081857] LTP: starting tirpc_clnt_create_timed (rpc_test.sh -s tirpc_svc_2 -c tirpc_clnt_create_timed) user :warn : [ 163.234462] LTP: starting tirpc_svc_create (rpc_test.sh -c tirpc_svc_create) user :notice: [ 163.507518] Gnu C gcc (Debian 12.2.0-14+deb12u1) 12.2.0 user :notice: [ 163.508419] Clang user :notice: [ 163.509434] Gnu make 4.3 user :notice: [ 163.510405] util-linux 2.38.1 user :warn : [ 164.010443] LTP: starting tirpc_toplevel_clnt_call (rpc_test.sh -s tirpc_svc_2 -c tirpc_toplevel_clnt_call) user :warn : [ 165.054172] LTP: starting tirpc_clnt_destroy (rpc_test.sh -s tirpc_svc_2 -c tirpc_clnt_destroy) user :warn : [ 166.170592] LTP: starting tirpc_svc_destroy (rpc_test.sh -c tirpc_svc_destroy) user :err : [ 166.988060] ------------------------------------------- The kernel config and materials to reproduce are available at: https://download.01.org/0day-ci/archive/20250616/202506161619.6738c86c-lkp@intel.com -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
On Fri, Jun 13, 2025 at 10:37:58AM +1000, NeilBrown wrote: > > Some sysctl tables can provide an is_seen() function which reports if > the sysctl should be visible to the current process. This is currently > used to cause d_compare to fail for invisible sysctls. > > This technique might have worked in 2.6.26 when it was implemented, but > it cannot work now. In particular if ->d_compare always fails for a > particular name, then d_alloc_parallel() will always create a new dentry > and pass it to lookup() resulting in a new inode for every lookup. I > tested this by changing sysctl_is_seen() to always return 0. When > all sysctls were still visible and repeated lookups (ls -li) reported > different inode numbers. What do you mean, "name"?
On Fri, Jun 13, 2025 at 02:54:21AM +0100, Al Viro wrote: > On Fri, Jun 13, 2025 at 10:37:58AM +1000, NeilBrown wrote: > > > > Some sysctl tables can provide an is_seen() function which reports if > > the sysctl should be visible to the current process. This is currently > > used to cause d_compare to fail for invisible sysctls. > > > > This technique might have worked in 2.6.26 when it was implemented, but > > it cannot work now. In particular if ->d_compare always fails for a > > particular name, then d_alloc_parallel() will always create a new dentry > > and pass it to lookup() resulting in a new inode for every lookup. I > > tested this by changing sysctl_is_seen() to always return 0. When > > all sysctls were still visible and repeated lookups (ls -li) reported > > different inode numbers. > > What do you mean, "name"? The whole fucking point of that thing is that /proc/sys/net contents for processes in different netns is not the same. And such processes should not screw each other into the ground by doing lookups in that area. Yes, it means multiple children of the same dentry having the same name *and* staying hashed at the same time.
On Fri, 13 Jun 2025, Al Viro wrote: > On Fri, Jun 13, 2025 at 02:54:21AM +0100, Al Viro wrote: > > On Fri, Jun 13, 2025 at 10:37:58AM +1000, NeilBrown wrote: > > > > > > Some sysctl tables can provide an is_seen() function which reports if > > > the sysctl should be visible to the current process. This is currently > > > used to cause d_compare to fail for invisible sysctls. > > > > > > This technique might have worked in 2.6.26 when it was implemented, but > > > it cannot work now. In particular if ->d_compare always fails for a > > > particular name, then d_alloc_parallel() will always create a new dentry > > > and pass it to lookup() resulting in a new inode for every lookup. I > > > tested this by changing sysctl_is_seen() to always return 0. When > > > all sysctls were still visible and repeated lookups (ls -li) reported > > > different inode numbers. > > > > What do you mean, "name"? > > The whole fucking point of that thing is that /proc/sys/net contents for > processes in different netns is not the same. And such processes should > not screw each other into the ground by doing lookups in that area. > > Yes, it means multiple children of the same dentry having the same name > *and* staying hashed at the same time. > If two threads in the same namespace look up the same name at the same time (which previously didn't exist), they will both enter d_alloc_parallel() where neither will notice the other, so both will create and install d_in_lookup() dentries, and then both will call ->lookup, creating two identical inodes. I suspect that isn't fatal, but it does seem odd. Maybe proc_sys_compare should return 0 for d_in_lookup() (aka !inode) dentries, and then proc_sys_revalidate() can perform the is_seen test and return -EAGAIN if needed, and __lookup_slow() and others could interpret that as meaning to "goto again" without calling d_invalidate(). Maybe. NeilBrown
On Fri, Jun 13, 2025 at 12:37:33PM +1000, NeilBrown wrote: > If two threads in the same namespace look up the same name at the same > time (which previously didn't exist), they will both enter > d_alloc_parallel() where neither will notice the other, so both will > create and install d_in_lookup() dentries, and then both will call > ->lookup, creating two identical inodes. > > I suspect that isn't fatal, but it does seem odd. > > Maybe proc_sys_compare should return 0 for d_in_lookup() (aka !inode) > dentries, and then proc_sys_revalidate() can perform the is_seen test > and return -EAGAIN if needed, and __lookup_slow() and others could > interpret that as meaning to "goto again" without calling > d_invalidate(). Umm... Not sure it's the best solution; let me think a bit. Just need to finish going through the ported rpc_pipefs series for the final look and posting it; should be about half an hour or so...
On Fri, Jun 13, 2025 at 03:41:34AM +0100, Al Viro wrote: > On Fri, Jun 13, 2025 at 12:37:33PM +1000, NeilBrown wrote: > > > If two threads in the same namespace look up the same name at the same > > time (which previously didn't exist), they will both enter > > d_alloc_parallel() where neither will notice the other, so both will > > create and install d_in_lookup() dentries, and then both will call > > ->lookup, creating two identical inodes. > > > > I suspect that isn't fatal, but it does seem odd. > > > > Maybe proc_sys_compare should return 0 for d_in_lookup() (aka !inode) > > dentries, and then proc_sys_revalidate() can perform the is_seen test > > and return -EAGAIN if needed, and __lookup_slow() and others could > > interpret that as meaning to "goto again" without calling > > d_invalidate(). > > Umm... Not sure it's the best solution; let me think a bit. Just need > to finish going through the ported rpc_pipefs series for the final look > and posting it; should be about half an hour or so... FWIW, I think we need the following: mismatch in name/len => return 1 in_lookup => return 0, let the fucker get rechecked later when it ceases to be in_lookup; can only happen when we are called from d_alloc_parallel(). otherwise, NULL inode => return 1; we are seeing a dentry halfway through __dentry_kill(); caller is a lockless dcache lookup, under RCU otherwise, check ->sysctl and sysctl_is_seen(). And yes, you do need rcu_dereference() there. Caller must be holding rcu_read_lock or dentry->d_lock or have a counting reference to dentry.
On Fri, 13 Jun 2025, Al Viro wrote: > On Fri, Jun 13, 2025 at 02:54:21AM +0100, Al Viro wrote: > > On Fri, Jun 13, 2025 at 10:37:58AM +1000, NeilBrown wrote: > > > > > > Some sysctl tables can provide an is_seen() function which reports if > > > the sysctl should be visible to the current process. This is currently > > > used to cause d_compare to fail for invisible sysctls. > > > > > > This technique might have worked in 2.6.26 when it was implemented, but > > > it cannot work now. In particular if ->d_compare always fails for a > > > particular name, then d_alloc_parallel() will always create a new dentry > > > and pass it to lookup() resulting in a new inode for every lookup. I > > > tested this by changing sysctl_is_seen() to always return 0. When > > > all sysctls were still visible and repeated lookups (ls -li) reported > > > different inode numbers. > > > > What do you mean, "name"? > > The whole fucking point of that thing is that /proc/sys/net contents for > processes in different netns is not the same. And such processes should > not screw each other into the ground by doing lookups in that area. > > Yes, it means multiple children of the same dentry having the same name > *and* staying hashed at the same time. > Ahh - I misunderstood the meaning of "is_seen". It means "matches current namespace". I think I have a slightly better understanding now - thanks. I'll just remove the rcu stuff, which is pointless. Thanks, NeilBrown
© 2016 - 2025 Red Hat, Inc.