This patch introduces erofs_lookup_rust and erofs_get_parent_rust
written in Rust as an alternative to the original namei.c.
Signed-off-by: Yiyang Wu <toolmanp@tlmp.cc>
---
fs/erofs/Makefile | 2 +-
fs/erofs/internal.h | 2 ++
fs/erofs/namei.c | 2 ++
fs/erofs/namei_rs.rs | 56 ++++++++++++++++++++++++++++++++++++++++
fs/erofs/rust_bindings.h | 4 ++-
fs/erofs/super.c | 2 ++
6 files changed, 66 insertions(+), 2 deletions(-)
create mode 100644 fs/erofs/namei_rs.rs
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 46de6f490ca2..0f748f3e0ff6 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -9,4 +9,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
-erofs-$(CONFIG_EROFS_FS_RUST) += super_rs.o inode_rs.o rust_helpers.o
+erofs-$(CONFIG_EROFS_FS_RUST) += super_rs.o inode_rs.o namei_rs.o rust_helpers.o
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 42ce84783be7..1d9dfae285d5 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -442,6 +442,8 @@ int erofs_iget5_eq(struct inode *inode, void *opaque);
int erofs_iget5_set(struct inode *inode, void *opaque);
#ifdef CONFIG_EROFS_FS_RUST
#define erofs_iget erofs_iget_rust
+#define erofs_get_parent erofs_get_parent_rust
+#define erofs_lookup erofs_lookup_rust
#else
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
#endif
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index c94d0c1608a8..f657d475c4a1 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -7,6 +7,7 @@
#include "xattr.h"
#include <trace/events/erofs.h>
+#ifndef CONFIG_EROFS_FS_RUST
struct erofs_qstr {
const unsigned char *name;
const unsigned char *end;
@@ -214,6 +215,7 @@ static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry,
inode = erofs_iget(dir->i_sb, nid);
return d_splice_alias(inode, dentry);
}
+#endif
const struct inode_operations erofs_dir_iops = {
.lookup = erofs_lookup,
diff --git a/fs/erofs/namei_rs.rs b/fs/erofs/namei_rs.rs
new file mode 100644
index 000000000000..d73a0a7bee1e
--- /dev/null
+++ b/fs/erofs/namei_rs.rs
@@ -0,0 +1,56 @@
+// Copyright 2024 Yiyang Wu
+// SPDX-License-Identifier: MIT or GPL-2.0-or-later
+
+//! EROFS Rust Kernel Module Helpers Implementation
+//! This is only for experimental purpose. Feedback is always welcome.
+
+#[allow(dead_code)]
+#[allow(missing_docs)]
+pub(crate) mod rust;
+use core::ffi::*;
+use core::ptr::NonNull;
+
+use kernel::bindings::{d_obtain_alias, d_splice_alias, dentry, inode};
+use kernel::container_of;
+
+use rust::{erofs_sys::operations::*, kinode::*, ksuperblock::*};
+
+/// Lookup function for dentry-inode lookup replacement.
+#[no_mangle]
+pub unsafe extern "C" fn erofs_lookup_rust(
+ k_inode: NonNull<inode>,
+ dentry: NonNull<dentry>,
+ _flags: c_uint,
+) -> *mut c_void {
+ // SAFETY: We are sure that the inode is a Kernel Inode since alloc_inode is called
+ let erofs_inode = unsafe { &*container_of!(k_inode.as_ptr(), KernelInode, k_inode) };
+ // SAFETY: The super_block is initialized when the erofs_alloc_sbi_rust is called.
+ let sbi = erofs_sbi(unsafe { NonNull::new(k_inode.as_ref().i_sb).unwrap() });
+ // SAFETY: this is backed by qstr which is c representation of a valid slice.
+ let name = unsafe {
+ core::str::from_utf8_unchecked(core::slice::from_raw_parts(
+ dentry.as_ref().d_name.name,
+ dentry.as_ref().d_name.__bindgen_anon_1.__bindgen_anon_1.len as usize,
+ ))
+ };
+ let k_inode: *mut inode =
+ dir_lookup(sbi.filesystem.as_ref(), &mut sbi.inodes, erofs_inode, name)
+ .map_or(core::ptr::null_mut(), |result| result.k_inode.as_mut_ptr());
+
+ // SAFETY: We are sure that the inner k_inode has already been initialized.
+ unsafe { d_splice_alias(k_inode, dentry.as_ptr()).cast() }
+}
+
+/// Exported as a replacement of erofs_get_parent.
+#[no_mangle]
+pub unsafe extern "C" fn erofs_get_parent_rust(child: NonNull<dentry>) -> *mut c_void {
+ // SAFETY: We are sure that the inode is a Kernel Inode since alloc_inode is called
+ let k_inode = unsafe { child.as_ref().d_inode };
+ // SAFETY: The super_block is initialized when the erofs_alloc_sbi_rust is called.
+ let sbi = erofs_sbi(unsafe { NonNull::new((*k_inode).i_sb).unwrap() }); // SAFETY: We are sure that the inode is a Kernel Inode since alloc_inode is called
+ let inode = unsafe { &*container_of!(k_inode, KernelInode, k_inode) };
+ let k_inode: *mut inode = dir_lookup(sbi.filesystem.as_ref(), &mut sbi.inodes, inode, "..")
+ .map_or(core::ptr::null_mut(), |result| result.k_inode.as_mut_ptr());
+ // SAFETY: We are sure that the inner k_inode has already been initialized
+ unsafe { d_obtain_alias(k_inode).cast() }
+}
diff --git a/fs/erofs/rust_bindings.h b/fs/erofs/rust_bindings.h
index 657f109dd6e7..b35014aa5cae 100644
--- a/fs/erofs/rust_bindings.h
+++ b/fs/erofs/rust_bindings.h
@@ -6,7 +6,6 @@
#include <linux/fs.h>
-
typedef u64 erofs_nid_t;
typedef u64 erofs_off_t;
/* data type for filesystem-wide blocks number */
@@ -21,4 +20,7 @@ extern void *erofs_alloc_sbi_rust(struct super_block *sb);
extern void *erofs_free_sbi_rust(struct super_block *sb);
extern int erofs_iget5_eq_rust(struct inode *inode, void *opaque);
extern struct inode *erofs_iget_rust(struct super_block *sb, erofs_nid_t nid);
+extern struct dentry *erofs_lookup_rust(struct inode *inode, struct dentry *dentry,
+ unsigned int flags);
+extern struct dentry *erofs_get_parent_rust(struct dentry *dentry);
#endif
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 659502bdf5fe..d49c804acf3d 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -554,6 +554,7 @@ static struct dentry *erofs_fh_to_parent(struct super_block *sb,
erofs_nfs_get_inode);
}
+#ifndef CONFIG_EROFS_FS_RUST
static struct dentry *erofs_get_parent(struct dentry *child)
{
erofs_nid_t nid;
@@ -565,6 +566,7 @@ static struct dentry *erofs_get_parent(struct dentry *child)
return ERR_PTR(err);
return d_obtain_alias(erofs_iget(child->d_sb, nid));
}
+#endif
static const struct export_operations erofs_export_ops = {
.encode_fh = generic_encode_ino32_fh,
--
2.46.0
On Mon, Sep 16, 2024 at 09:56:29PM +0800, Yiyang Wu wrote: > +/// Lookup function for dentry-inode lookup replacement. > +#[no_mangle] > +pub unsafe extern "C" fn erofs_lookup_rust( > + k_inode: NonNull<inode>, > + dentry: NonNull<dentry>, > + _flags: c_uint, > +) -> *mut c_void { > + // SAFETY: We are sure that the inode is a Kernel Inode since alloc_inode is called > + let erofs_inode = unsafe { &*container_of!(k_inode.as_ptr(), KernelInode, k_inode) }; Ummm... A wrapper would be highly useful. And the reason why it's safe is different - your function is called only via ->i_op->lookup, the is only one instance of inode_operations that has that ->lookup method, and the only place where an inode gets ->i_op set to that is erofs_fill_inode(). Which is always passed erofs_inode::vfs_inode. > + // SAFETY: The super_block is initialized when the erofs_alloc_sbi_rust is called. > + let sbi = erofs_sbi(unsafe { NonNull::new(k_inode.as_ref().i_sb).unwrap() }); Again, that calls for a wrapper - this time not erofs-specific; inode->i_sb is *always* non-NULL, is assign-once and always points to live struct super_block instance at least until the call of destroy_inode(). > + // SAFETY: this is backed by qstr which is c representation of a valid slice. What is that sentence supposed to mean? Nevermind "why is it correct"... > + let name = unsafe { > + core::str::from_utf8_unchecked(core::slice::from_raw_parts( > + dentry.as_ref().d_name.name, > + dentry.as_ref().d_name.__bindgen_anon_1.__bindgen_anon_1.len as usize, Is that supposed to be an example of idiomatic Rust? I'm not trying to be snide, but my interest here is mostly about safety of access to VFS data structures. And ->d_name is _very_ unpleasant in that respect; the locking rules required for its stability are subtle and hard to verify on manual code audit. Current erofs_lookup() (and your version as well) *is* indeed safe in that respect, but the proof (from filesystem POV) is that "it's called only as ->lookup() instance, so dentry is initially unhashed negative and will remain such until it's passed to d_splice_alias(); until that point it is guaranteed to have ->d_name and ->d_parent stable". Note that once you _have_ called d_splice_alias(), you can't count upon the ->d_name stability - or, indeed, upon ->d_name.name you've sampled still pointing to allocated memory. For directory-modifying methods it's "stable, since parent is held exclusive". Some internal function called from different environments? Well... Swear, look through the call graph and see what can be proven for each. Expressing that kind of fun in any kind of annotations (Rust type system included) is not pleasant. _Probably_ might be handled by a type that would be a dentry pointer with annotation along the lines "->d_name and ->d_parent of that one are stable". Then e.g. ->lookup() would take that thing as an argument and d_splice_alias() would consume it. ->mkdir() would get the same thing, etc. I hadn't tried to get that all way through (the amount of annotation churn in existing filesystems would be high and hard to split into reviewable patch series), so there might be dragons - and there definitely are places where the stability is proven in different ways (e.g. if dentry->d_lock is held, we have the damn thing stable; then there's a "take a safe snapshot of name" API; etc.). I want to reduce the PITA of regular code audits. If, at some point, Rust use in parts of the tree reduces that - wonderful. But then we'd better make sure that Rust-side uses _are_ safe, accurately annotated and easy to grep for. Because we'll almost certainly need to change method calling conventions at some points through all of that. Even if it's just the annotation-level, any such contract change (it is doable and quite a few had been done) will require going through the instances and checking how much massage will be needed in those. Opaque chunks like the above promise to make that very painful...
On Mon, Sep 16, 2024 at 06:08:01PM GMT, Al Viro wrote: > On Mon, Sep 16, 2024 at 09:56:29PM +0800, Yiyang Wu wrote: > > +/// Lookup function for dentry-inode lookup replacement. > > +#[no_mangle] > > +pub unsafe extern "C" fn erofs_lookup_rust( > > + k_inode: NonNull<inode>, > > + dentry: NonNull<dentry>, > > + _flags: c_uint, > > +) -> *mut c_void { > > + // SAFETY: We are sure that the inode is a Kernel Inode since alloc_inode is called > > + let erofs_inode = unsafe { &*container_of!(k_inode.as_ptr(), KernelInode, k_inode) }; > > Ummm... A wrapper would be highly useful. And the reason why > it's safe is different - your function is called only via ->i_op->lookup, > the is only one instance of inode_operations that has that ->lookup > method, and the only place where an inode gets ->i_op set to that > is erofs_fill_inode(). Which is always passed erofs_inode::vfs_inode. > So my original intention behind this is that all vfs_inodes come from that erofs_iget function and it's always gets initialized in this case And this just followes the same convention here. I can document this more precisely. > > + // SAFETY: The super_block is initialized when the erofs_alloc_sbi_rust is called. > > + let sbi = erofs_sbi(unsafe { NonNull::new(k_inode.as_ref().i_sb).unwrap() }); > > Again, that calls for a wrapper - this time not erofs-specific; > inode->i_sb is *always* non-NULL, is assign-once and always points > to live struct super_block instance at least until the call of > destroy_inode(). > Will be modified correctly, I'm not a native speaker and I just can't find a better way, I will take my note here. > > + // SAFETY: this is backed by qstr which is c representation of a valid slice. > > What is that sentence supposed to mean? Nevermind "why is it correct"... > > > + let name = unsafe { > > + core::str::from_utf8_unchecked(core::slice::from_raw_parts( > > + dentry.as_ref().d_name.name, > > + dentry.as_ref().d_name.__bindgen_anon_1.__bindgen_anon_1.len as usize, > > Is that supposed to be an example of idiomatic Rust? I'm not > trying to be snide, but my interest here is mostly about safety of > access to VFS data structures. And ->d_name is _very_ unpleasant in > that respect; the locking rules required for its stability are subtle > and hard to verify on manual code audit. > Yeah, this code is pretty messed up. I just cannot find a better way to use this qstr in dentry. So the original C qstr is this. ```c struct qstr { union { struct { HASH_LEN_DECLARE; }; u64 hash_len; }; const unsigned char *name; }; ``` The original C code can use this pretty easily. ```C int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, unsigned int *d_type) { int ndirents; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; struct erofs_qstr qn; if (!dir->i_size) return -ENOENT; qn.name = name->name; qn.end = name->name + name->len; buf.mapping = dir->i_mapping; ``` But after bindgen, since Rust does not support any kinds of anonymous unions here it just converts to this. ```rust #[repr(C)] #[derive(Copy, Clone)] pub struct qstr { pub __bindgen_anon_1: qstr__bindgen_ty_1, pub name: *const core::ffi::c_uchar, } #[repr(C)] #[derive(Copy, Clone)] pub union qstr__bindgen_ty_1 { pub __bindgen_anon_1: qstr__bindgen_ty_1__bindgen_ty_1, pub hash_len: u64_, } ``` And it just somehow degrades to this pure mess. :( I know this is stupid :( > Current erofs_lookup() (and your version as well) *is* indeed > safe in that respect, but the proof (from filesystem POV) is that "it's > called only as ->lookup() instance, so dentry is initially unhashed > negative and will remain such until it's passed to d_splice_alias(); > until that point it is guaranteed to have ->d_name and ->d_parent stable". > > Note that once you _have_ called d_splice_alias(), you can't > count upon the ->d_name stability - or, indeed, upon ->d_name.name you've > sampled still pointing to allocated memory. > > For directory-modifying methods it's "stable, since parent is held > exclusive". Some internal function called from different environments? > Well... Swear, look through the call graph and see what can be proven > for each. Sorry for my ignorance. I mean i just borrowed the code from the fs/erofs/namei.c and i directly translated that into Rust code. That might be a problem that also exists in original working C code. > Expressing that kind of fun in any kind of annotations (Rust type > system included) is not pleasant. _Probably_ might be handled by a type > that would be a dentry pointer with annotation along the lines "->d_name > and ->d_parent of that one are stable". Then e.g. ->lookup() would > take that thing as an argument and d_splice_alias() would consume it. > ->mkdir() would get the same thing, etc. I hadn't tried to get that > all way through (the amount of annotation churn in existing filesystems > would be high and hard to split into reviewable patch series), so there > might be dragons - and there definitely are places where the stability is > proven in different ways (e.g. if dentry->d_lock is held, we have the damn > thing stable; then there's a "take a safe snapshot of name" API; etc.). That's kinda interesting, I originally thought that VFS will make sure its d_name / d_parent is stable in the first place. Again, I just don't have a full picture or understanding of VFS and my code is just basic translation of original C code, Maybe we can address this later. > I want to reduce the PITA of regular code audits. If, at > some point, Rust use in parts of the tree reduces that - wonderful. > But then we'd better make sure that Rust-side uses _are_ safe, accurately > annotated and easy to grep for. Yeah, even a small portion of VFS gets abstracted can get things work smoothly, like inode and dentry data structure. From my understanding Rust can make some of stability issues you have mentioned compilation errors if it's correctly annotated. > Because we'll almost certainly need to > change method calling conventions at some points through all of that. > Even if it's just the annotation-level, any such contract change (it > is doable and quite a few had been done) will require going through > the instances and checking how much massage will be needed in those. > Opaque chunks like the above promise to make that very painful... Certainly needs a lot of energy and efforts. WE just need a guy who has full grasp of a lot of filesystems and Rust to solve this stuff altogether to avoid the abstraction degraded into some toy concepts.
On 2024/9/17 14:48, Yiyang Wu wrote: > On Mon, Sep 16, 2024 at 06:08:01PM GMT, Al Viro wrote: >> On Mon, Sep 16, 2024 at 09:56:29PM +0800, Yiyang Wu wrote: >>> +/// Lookup function for dentry-inode lookup replacement. >>> +#[no_mangle] >>> +pub unsafe extern "C" fn erofs_lookup_rust( >>> + k_inode: NonNull<inode>, >>> + dentry: NonNull<dentry>, >>> + _flags: c_uint, >>> +) -> *mut c_void { >>> + // SAFETY: We are sure that the inode is a Kernel Inode since alloc_inode is called >>> + let erofs_inode = unsafe { &*container_of!(k_inode.as_ptr(), KernelInode, k_inode) }; >> >> Ummm... A wrapper would be highly useful. And the reason why >> it's safe is different - your function is called only via ->i_op->lookup, >> the is only one instance of inode_operations that has that ->lookup >> method, and the only place where an inode gets ->i_op set to that >> is erofs_fill_inode(). Which is always passed erofs_inode::vfs_inode. >> > So my original intention behind this is that all vfs_inodes come from > that erofs_iget function and it's always gets initialized in this case > And this just followes the same convention here. I can document this > more precisely. I think Al just would like a wrapper here, like the current C EROFS_I(). >>> + // SAFETY: The super_block is initialized when the erofs_alloc_sbi_rust is called. >>> + let sbi = erofs_sbi(unsafe { NonNull::new(k_inode.as_ref().i_sb).unwrap() }); >> >> Again, that calls for a wrapper - this time not erofs-specific; >> inode->i_sb is *always* non-NULL, is assign-once and always points >> to live struct super_block instance at least until the call of >> destroy_inode(). >> > > Will be modified correctly, I'm not a native speaker and I just can't > find a better way, I will take my note here. Same here, like the current EROFS_I_SB(). >>> + // SAFETY: this is backed by qstr which is c representation of a valid slice. >> >> What is that sentence supposed to mean? Nevermind "why is it correct"... >> >>> + let name = unsafe { >>> + core::str::from_utf8_unchecked(core::slice::from_raw_parts( >>> + dentry.as_ref().d_name.name, >>> + dentry.as_ref().d_name.__bindgen_anon_1.__bindgen_anon_1.len as usize, >> ... > >> Current erofs_lookup() (and your version as well) *is* indeed >> safe in that respect, but the proof (from filesystem POV) is that "it's >> called only as ->lookup() instance, so dentry is initially unhashed >> negative and will remain such until it's passed to d_splice_alias(); >> until that point it is guaranteed to have ->d_name and ->d_parent stable". Agreed. >> >> Note that once you _have_ called d_splice_alias(), you can't >> count upon the ->d_name stability - or, indeed, upon ->d_name.name you've >> sampled still pointing to allocated memory. >> >> For directory-modifying methods it's "stable, since parent is held >> exclusive". Some internal function called from different environments? >> Well... Swear, look through the call graph and see what can be proven >> for each. > > Sorry for my ignorance. > I mean i just borrowed the code from the fs/erofs/namei.c and i directly > translated that into Rust code. That might be a problem that also > exists in original working C code. As for EROFS (an immutable fs), I think after d_splice_alias(), d_name is still stable (since we don't have rename semantics likewise for now). But as the generic filesystem POV, d_name access is actually tricky under RCU walk path indeed. > >> Expressing that kind of fun in any kind of annotations (Rust type >> system included) is not pleasant. _Probably_ might be handled by a type >> that would be a dentry pointer with annotation along the lines "->d_name >> and ->d_parent of that one are stable". Then e.g. ->lookup() would >> take that thing as an argument and d_splice_alias() would consume it. >> ->mkdir() would get the same thing, etc. I hadn't tried to get that >> all way through (the amount of annotation churn in existing filesystems >> would be high and hard to split into reviewable patch series), so there >> might be dragons - and there definitely are places where the stability is >> proven in different ways (e.g. if dentry->d_lock is held, we have the damn >> thing stable; then there's a "take a safe snapshot of name" API; etc.). > > That's kinda interesting, I originally thought that VFS will make sure > its d_name / d_parent is stable in the first place. > Again, I just don't have a full picture or understanding of VFS and my > code is just basic translation of original C code, Maybe we can address > this later. d_alloc will allocate an unhashed dentry which is almost unrecognized by VFS dcache (d_name is stable of course). After d_splice_alias() and d_add(), rename() could change d_name. So either we take d_lock or with rcu_read_lock() to take a snapshot of d_name in the RCU walk path. That is my overall understanding. But for EROFS, since we don't have rename, so it doesn't matter. Thanks, Gao Xiang
On Tue, Sep 17, 2024 at 03:14:58PM +0800, Gao Xiang wrote: > > Sorry for my ignorance. > > I mean i just borrowed the code from the fs/erofs/namei.c and i directly > > translated that into Rust code. That might be a problem that also > > exists in original working C code. > > As for EROFS (an immutable fs), I think after d_splice_alias(), d_name is > still stable (since we don't have rename semantics likewise for now). Even on corrupted images? If you have two directories with entries that act as hardlinks to the same subdirectory, and keep hitting them on lookups, it will have to transplant the subtree between the parents. > But as the generic filesystem POV, d_name access is actually tricky under > RCU walk path indeed. ->lookup() is never called in RCU mode. > > That's kinda interesting, I originally thought that VFS will make sure > > its d_name / d_parent is stable in the first place. > > Again, I just don't have a full picture or understanding of VFS and my > > code is just basic translation of original C code, Maybe we can address > > this later. > > d_alloc will allocate an unhashed dentry which is almost unrecognized > by VFS dcache (d_name is stable of course). > > After d_splice_alias() and d_add(), rename() could change d_name. So > either we take d_lock or with rcu_read_lock() to take a snapshot of > d_name in the RCU walk path. That is my overall understanding. No, it's more complicated than that, sadly. ->d_name and ->d_parent are the trickiest parts of dentry field stability. > But for EROFS, since we don't have rename, so it doesn't matter. See above. IF we could guarantee that all filesystem images are valid and will remain so, life would be much simpler.
On 2024/9/17 15:31, Al Viro wrote: > On Tue, Sep 17, 2024 at 03:14:58PM +0800, Gao Xiang wrote: > >>> Sorry for my ignorance. >>> I mean i just borrowed the code from the fs/erofs/namei.c and i directly >>> translated that into Rust code. That might be a problem that also >>> exists in original working C code. >> >> As for EROFS (an immutable fs), I think after d_splice_alias(), d_name is >> still stable (since we don't have rename semantics likewise for now). > > Even on corrupted images? If you have two directories with entries that > act as hardlinks to the same subdirectory, and keep hitting them on lookups, > it will have to transplant the subtree between the parents. Oh, I missed unexpected directory hardlink corrupted cases. > >> But as the generic filesystem POV, d_name access is actually tricky under >> RCU walk path indeed. > > ->lookup() is never called in RCU mode. I know, I just said d_name access is tricky in RCU walk. ->lookup() is for real lookup, not search dcache as fast cached lookup in the RCU context. Thanks, Gao Xiang
On Tue, Sep 17, 2024 at 08:31:49AM +0100, Al Viro wrote: > > After d_splice_alias() and d_add(), rename() could change d_name. So > > either we take d_lock or with rcu_read_lock() to take a snapshot of > > d_name in the RCU walk path. That is my overall understanding. > > No, it's more complicated than that, sadly. ->d_name and ->d_parent are > the trickiest parts of dentry field stability. > > > But for EROFS, since we don't have rename, so it doesn't matter. > > See above. IF we could guarantee that all filesystem images are valid > and will remain so, life would be much simpler. In any case, currently it is safe - d_splice_alias() is the last thing done by erofs_lookup(). Just don't assume that names can't change in there - and the fewer places in filesystem touch ->d_name, the better. In practice, for ->lookup() you are safe until after d_splice_alias() and for directory-modifying operations you are safe unless you start playing insane games with unlocking and relocking the parent directories (apparmorfs does; the locking is really obnoxious there). That covers the majority of ->d_name and ->d_parent accesses in filesystem code. ->d_hash() and ->d_compare() are separate story; I've posted a text on that last year (or this winter - not sure, will check once I get some sleep). d_path() et.al. are taking care to do the right thing; those (and %pd format) can be used safely. Anyway, I'm half-asleep at the moment and I'd rather leave writing these rules up until tomorrow. Sorry...
On Tue, Sep 17, 2024 at 08:44:29AM +0100, Al Viro wrote: > Anyway, I'm half-asleep at the moment and I'd rather leave writing these > rules up until tomorrow. Sorry... [Below are the bits of my notes related to d_name and d_parent, with most of the unprintable parts thrown out and minimal markup added. Probably not all relevant notes are here - this has been culled from a bunch of files sitting around and I might've missed some] NOTE: ->d_parent and ->d_name are by far the worst parts of dentry wrt stability rules. Code audits are not pleasant, to put it mildly. This covers the bits outside of VFS proper - verifying the locking rules is a separate story. A Really Blunt Tool You Should Not Use. ====================================== All changes of ->d_parent are serialized on rename_lock. It's *NOT* something you want the stuff outside of core VFS to touch, though. It's a seqlock, and write_seqlock() on it is limited to fs/dcache.c alone. Reader side is allowed, but it's still not something you want to use lightly - outside of fs/dcache.c, fs/d_path.c and fs/namei.c there are only 3 users (ceph_mdsc_build_path(), nfs_path() and auditsc handle_path()). Don't add more without a discussion on fsdevel and detailed ACKs; it's quite likely that a better solution will be found. With one exception (see the discussion of d_mark_tmpfile() in the end), ->d_name is also stabilized by that. Slightly Less Blunt Tool You Still Should Not Use. ================================================== ->s_vfs_rename_mutex will stabilize ->d_parent. The trouble is, while it's not system-wide like rename_lock, it's fs-wide, so there's a plenty of contention to run into *AND* if you try that while ->i_rwsem is held on some directory in that filesystem, you are fucked - lock_rename() (and rename(2), and...) will deadlock on you. Nothing outside of fs/{namei,dcache}.c touches it directly; there is an indirect use (lock_rename()), but that should be done only around the call of cross-directory rename on _another_ filesystem - overlayfs moving stuff within the writable layer, ecryptfs doing rename on underlying filesystem, nfsd handling rename request from client, etc. Anyone trying to use that hammer without a good reason will be very sorry - that's a promise. The pain will begin with the request to adjust the proof of deadlock avoidance in directory locking and it will only go downwards from there... Parent's ->i_rwsem Held Exclusive. ================================== That stabilizes ->d_parent and ->d_name. To be more precise, holding parent->d_inode->i_rwsem exclusive stabilizes the result of (dentry->d_parent == parent). That is to say, dentry that is a child of parent will remain such and dentry that isn't a child won't become a child. holding parent->d_inode->i_rwsem exclusive stabilizes ->d_name of all children. In other words, if you've locked the parent exclusive and found something to be its child while keeping the parent locked, child will have ->d_parent and ->d_name stable until you unlock the parent. That covers most of the directory-modifying methods - stuff like ->mkdir(), ->unlink(), ->rename(), etc. can access ->d_parent and ->d_name of the dentry argument(s) without any extra locks; the caller is already holding enough. Well, unless you are special (*cough* apparmor *cough*) and feel like dropping and regaining the lock on parent inside your ->mkdir()... Don't do that, please - you might have no renames, but there's a plenty of other headache you can get into that way. Negatives. ========== Negative dentry doesn't change ->d_parent or ->d_name. Of course, that is only worth something if you are guaranteed that it won't become positive under you - if that happens, all bets are off. Holding the inode of parent locked (at least) shared is enough to guarantee that. That takes care of ->lookup() instances - their dentry argument has ->d_parent and ->d_name stable until you make it positive (normally - by d_splice_alias()). Once you've done d_splice_alias(), you'd better be careful with access to those; you won't get hit by concurrent rename() (it locks parent(s) exclusive), but if your inode is a directory and d_splice_alias() *elsewhere* picks the same inode (fs image corruption, network filesystem with rename done from another client behind your back, etc.), you'll see the sucker moved. In practice, d_splice_alias() is the last thing done by most of ->lookup() instances - whatever it has returned gets returned by ->lookup() itself, possibly after freeing some temporary allocations, etc. The rest needs to watch out for accesses to ->d_name and ->d_parent downstream of d_splice_alias() return. Another case where we are guaranteed that dentry is negative and will stay so is ->d_release() - it's called for a dentry that is well on the way to becoming an ex-parrot; it's already marked dead, unhashed and negative. So a ->d_release() instance doesn't have to worry about ->d_name and ->d_parent - both are valid and stable. sprintf(). ========== %pd prints dentry name, safely. %p2d - parent_name/dentry_name, etc. up to %p4d. %pD .. %p4D do the same by file reference. Any time you see pr_warn("Some weird bollocks with %s (%d)\n", dentry->d_name, err); it should've been pr_warn("Some weird bollocks with %pd (%d)\n", dentry, err)... d_path() and friends are there for purpose - don't open-code those without a damn good reason. Checking if one dentry is an ancestor of another. ================================================= Use the damn is_subdir(), don't open-code it. Spinlocks. ========== dentry->d_lock stabilizes ->d_parent and ->d_name (as well as almost everything else about dentry); downside is that it's a spinlock *and* nesting it is not to be attempted easily; you are allowed to lock child while holding lock on parent, but very few places have any business doing that (only 2 outside of VFS - tree-walking in autofs, which might eventually get redone avoiding that and fsnotify_set_children_dentry_flags(), which just might get moved to fs/dcache.c itself; we'll see) Note that "almost everything" includes refcount; that is to say, dget() and dput() will spin if you are holding ->d_lock, so you can't dget() the parent under ->d_lock on child - that's a locking order violation that can easily deadlock on you. dget_parent() does that kind of thing safely, and a look at it might be instructive. Try to open-code something of that sort, and you'll be hurt. Said that, dget_parent() is overused - it has legitimate uses, but more often than not it's the wrong tool. In particular, while you grab a reference to something that was the parent at some point during dget_parent(), it might not be the parent anymore by the time it is returned to caller. Most of the dget_parent() uses are due to bad calling conventions of ->d_revalidate(). When that gets sanitized, those will be gone. The methods that are called with ->d_lock get the protection - that would be ->d_delete() and ->d_prune(). ->d_lock on parent is also sufficient; similar to exclusive ->i_rwsem, parent->d_lock stabilizes (dentry->d_parent == parent) and if child has been observed to have child->d_parent equal to parent after you've locked parent->d_lock, you know that child->d_{parent,name} will remain stable until you unlock the parent. Name Snapshots. =============== There's take_dentry_name_snapshot()/release_dentry_name_snapshot(). That stuff eats about 64 bytes on stack (longer names _are_ handled correctly; no allocations are needed, we can simply grab an extra reference to external name and hold it). Can't be done under ->d_lock, won't do anything about ->d_parent *and* there's nothing to prevent dentry being renamed while you are looking at the name snapshot. Sometimes it's useful... RCU Headaches. ============== See e.g. https://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git/commit/?h=fixes.pathwalk-rcu-2&id=8d0a75eba81813cbb00beb73a67783e1cde9982f (NB: ought to repost that) [*] d_mark_tmpfile() is pretty much a delayed bit of constructor. There is a possible intermediate state of dentry ("will be tmpfile one"); dentries in that state are all created in vfs_tmpfile(), get passed to ->tmpfile() where they transition to normal unhashed postive dentries. The reason why name is not set from the very beginning is that at that point we do not know the inumber of inode they are going to get (that becomes known inside ->tmpfile() instance) and we want that inumber seen in their names (for /proc/*/fd/*, basically). Name change is done while holding ->d_lock both on dentry itself and on its parent.
On 2024/9/17 15:44, Al Viro wrote: > On Tue, Sep 17, 2024 at 08:31:49AM +0100, Al Viro wrote: > >>> After d_splice_alias() and d_add(), rename() could change d_name. So >>> either we take d_lock or with rcu_read_lock() to take a snapshot of >>> d_name in the RCU walk path. That is my overall understanding. >> >> No, it's more complicated than that, sadly. ->d_name and ->d_parent are >> the trickiest parts of dentry field stability. >> >>> But for EROFS, since we don't have rename, so it doesn't matter. >> >> See above. IF we could guarantee that all filesystem images are valid >> and will remain so, life would be much simpler. > > In any case, currently it is safe - d_splice_alias() is the last thing > done by erofs_lookup(). Just don't assume that names can't change in > there - and the fewer places in filesystem touch ->d_name, the better. > > In practice, for ->lookup() you are safe until after d_splice_alias() > and for directory-modifying operations you are safe unless you start > playing insane games with unlocking and relocking the parent directories > (apparmorfs does; the locking is really obnoxious there). That covers > the majority of ->d_name and ->d_parent accesses in filesystem code. > > ->d_hash() and ->d_compare() are separate story; I've posted a text on > that last year (or this winter - not sure, will check once I get some > sleep). > > d_path() et.al. are taking care to do the right thing; those (and %pd > format) can be used safely. > > Anyway, I'm half-asleep at the moment and I'd rather leave writing these > rules up until tomorrow. Sorry... Agreed, thanks for writing so many words on this! Thanks, Gao Xiang
© 2016 - 2024 Red Hat, Inc.