During the transition from providing exclusive locking on the directory
for directory modifying operation to providing exclusive locking only on
the dentry with a shared lock on the directory - we need an alternate
way to provide exclusion on the directory for file systems which haven't
been converted. This is provided by inode_dir_lock() and
inode_dir_inlock().
This uses a bit in i_state for locking, and wait_var_event_spinlock() for
waiting.
Signed-off-by: NeilBrown <neilb@suse.de>
---
fs/inode.c | 3 ++
fs/namei.c | 81 +++++++++++++++++++++++++++++++++++++---------
include/linux/fs.h | 5 +++
3 files changed, 74 insertions(+), 15 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 6b4c77268fc0..9ba69837aa56 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -492,6 +492,8 @@ EXPORT_SYMBOL(address_space_init_once);
*/
void inode_init_once(struct inode *inode)
{
+ static struct lock_class_key __key;
+
memset(inode, 0, sizeof(*inode));
INIT_HLIST_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_devices);
@@ -501,6 +503,7 @@ void inode_init_once(struct inode *inode)
INIT_LIST_HEAD(&inode->i_sb_list);
__address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
+ lockdep_init_map(&inode->i_dirlock_map, "I_DIR_LOCKED", &__key, 0);
}
EXPORT_SYMBOL(inode_init_once);
diff --git a/fs/namei.c b/fs/namei.c
index 371c80902c59..68750b15dbf4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3364,6 +3364,34 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
return mode;
}
+static bool check_dir_locked(struct inode *dir)
+{
+ if (dir->i_state & I_DIR_LOCKED) {
+ dir->i_state |= I_DIR_LOCK_WAITER;
+ return true;
+ }
+ return false;
+}
+
+static void inode_lock_dir(struct inode *dir)
+{
+ lock_acquire_exclusive(&dir->i_dirlock_map, 0, 0, NULL, _THIS_IP_);
+ spin_lock(&dir->i_lock);
+ wait_var_event_spinlock(dir, !check_dir_locked(dir),
+ &dir->i_lock);
+ dir->i_state |= I_DIR_LOCKED;
+ spin_unlock(&dir->i_lock);
+}
+
+static void inode_unlock_dir(struct inode *dir)
+{
+ lock_map_release(&dir->i_dirlock_map);
+ spin_lock(&dir->i_lock);
+ dir->i_state &= ~(I_DIR_LOCKED | I_DIR_LOCK_WAITER);
+ wake_up_var_locked(dir, &dir->i_lock);
+ spin_unlock(&dir->i_lock);
+}
+
/**
* vfs_create - create new file
* @idmap: idmap of the mount the inode was found from
@@ -3396,10 +3424,13 @@ int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
- if (dir->i_op->create_shared)
+ if (dir->i_op->create_shared) {
error = dir->i_op->create_shared(idmap, dir, dentry, mode, want_excl);
- else
+ } else {
+ inode_lock_dir(dir);
error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
+ inode_unlock_dir(dir);
+ }
if (!error)
fsnotify_create(dir, dentry);
return error;
@@ -3699,16 +3730,19 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
file->f_mode |= FMODE_CREATED;
audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
- if (dir_inode->i_op->create_shared)
+ if (dir_inode->i_op->create_shared) {
error = dir_inode->i_op->create_shared(idmap, dir_inode,
dentry, mode,
open_flag & O_EXCL);
- else if (dir_inode->i_op->create)
+ } else if (dir_inode->i_op->create) {
+ inode_lock_dir(dir_inode);
error = dir_inode->i_op->create(idmap, dir_inode,
dentry, mode,
open_flag & O_EXCL);
- else
+ inode_unlock_dir(dir_inode);
+ } else {
error = -EACCES;
+ }
if (error)
goto out_dput;
}
@@ -4227,10 +4261,13 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
if (error)
return error;
- if (dir->i_op->mknod_shared)
+ if (dir->i_op->mknod_shared) {
error = dir->i_op->mknod_shared(idmap, dir, dentry, mode, dev);
- else
+ } else {
+ inode_lock_dir(dir);
error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
+ inode_unlock_dir(dir);
+ }
if (!error)
fsnotify_create(dir, dentry);
return error;
@@ -4360,7 +4397,9 @@ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
else if (de)
dput(de);
} else {
+ inode_lock_dir(dir);
error = dir->i_op->mkdir(idmap, dir, dentry, mode);
+ inode_unlock_dir(dir);
}
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -4521,10 +4560,13 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
if (error)
goto out;
- if (dir->i_op->rmdir_shared)
+ if (dir->i_op->rmdir_shared) {
error = dir->i_op->rmdir_shared(dir, dentry);
- else
+ } else {
+ inode_lock_dir(dir);
error = dir->i_op->rmdir(dir, dentry);
+ inode_unlock_dir(dir);
+ }
if (error)
goto out;
@@ -4648,10 +4690,13 @@ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
error = try_break_deleg(target, delegated_inode);
if (error)
goto out;
- if (dir->i_op->unlink_shared)
+ if (dir->i_op->unlink_shared) {
error = dir->i_op->unlink_shared(dir, dentry);
- else
+ } else {
+ inode_lock_dir(dir);
error = dir->i_op->unlink(dir, dentry);
+ inode_unlock_dir(dir);
+ }
if (!error) {
dont_mount(dentry);
detach_mounts(dentry);
@@ -4792,10 +4837,13 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
if (error)
return error;
- if (dir->i_op->symlink_shared)
+ if (dir->i_op->symlink_shared) {
error = dir->i_op->symlink_shared(idmap, dir, dentry, oldname);
- else
+ } else {
+ inode_lock_dir(dir);
error = dir->i_op->symlink(idmap, dir, dentry, oldname);
+ inode_unlock_dir(dir);
+ }
if (!error)
fsnotify_create(dir, dentry);
return error;
@@ -4920,10 +4968,13 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
error = try_break_deleg(inode, delegated_inode);
if (error)
;
- else if (dir->i_op->link_shared)
+ else if (dir->i_op->link_shared) {
error = dir->i_op->link_shared(old_dentry, dir, new_dentry);
- else
+ } else {
+ inode_lock_dir(dir);
error = dir->i_op->link(old_dentry, dir, new_dentry);
+ inode_unlock_dir(dir);
+ }
}
if (!error && (inode->i_state & I_LINKABLE)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 68eba181175b..3ca92a54f28e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -722,6 +722,8 @@ struct inode {
void (*free_inode)(struct inode *);
};
struct file_lock_context *i_flctx;
+
+ struct lockdep_map i_dirlock_map; /* For tracking I_DIR_LOCKED locks */
struct address_space i_data;
struct list_head i_devices;
union {
@@ -2493,6 +2495,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
#define I_SYNC_QUEUED (1 << 16)
#define I_PINNING_NETFS_WB (1 << 17)
+#define I_DIR_LOCK_WAITER (1 << 30)
+#define I_DIR_LOCKED (1 << 31)
+
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
--
2.47.0
On Fri, 20 Dec 2024 13:54:26 +1100 NeilBrown <neilb@suse.de>
> During the transition from providing exclusive locking on the directory
> for directory modifying operation to providing exclusive locking only on
> the dentry with a shared lock on the directory - we need an alternate
> way to provide exclusion on the directory for file systems which haven't
> been converted. This is provided by inode_dir_lock() and
> inode_dir_inlock().
> This uses a bit in i_state for locking, and wait_var_event_spinlock() for
> waiting.
>
Inventing anything like mutex sounds bad.
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
> fs/inode.c | 3 ++
> fs/namei.c | 81 +++++++++++++++++++++++++++++++++++++---------
> include/linux/fs.h | 5 +++
> 3 files changed, 74 insertions(+), 15 deletions(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 6b4c77268fc0..9ba69837aa56 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -492,6 +492,8 @@ EXPORT_SYMBOL(address_space_init_once);
> */
> void inode_init_once(struct inode *inode)
> {
> + static struct lock_class_key __key;
> +
> memset(inode, 0, sizeof(*inode));
> INIT_HLIST_NODE(&inode->i_hash);
> INIT_LIST_HEAD(&inode->i_devices);
> @@ -501,6 +503,7 @@ void inode_init_once(struct inode *inode)
> INIT_LIST_HEAD(&inode->i_sb_list);
> __address_space_init_once(&inode->i_data);
> i_size_ordered_init(inode);
> + lockdep_init_map(&inode->i_dirlock_map, "I_DIR_LOCKED", &__key, 0);
> }
> EXPORT_SYMBOL(inode_init_once);
>
> diff --git a/fs/namei.c b/fs/namei.c
> index 371c80902c59..68750b15dbf4 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -3364,6 +3364,34 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
> return mode;
> }
>
> +static bool check_dir_locked(struct inode *dir)
> +{
> + if (dir->i_state & I_DIR_LOCKED) {
> + dir->i_state |= I_DIR_LOCK_WAITER;
> + return true;
> + }
> + return false;
> +}
> +
> +static void inode_lock_dir(struct inode *dir)
> +{
> + lock_acquire_exclusive(&dir->i_dirlock_map, 0, 0, NULL, _THIS_IP_);
> + spin_lock(&dir->i_lock);
> + wait_var_event_spinlock(dir, !check_dir_locked(dir),
> + &dir->i_lock);
> + dir->i_state |= I_DIR_LOCKED;
> + spin_unlock(&dir->i_lock);
> +}
> +
> +static void inode_unlock_dir(struct inode *dir)
> +{
> + lock_map_release(&dir->i_dirlock_map);
> + spin_lock(&dir->i_lock);
> + dir->i_state &= ~(I_DIR_LOCKED | I_DIR_LOCK_WAITER);
> + wake_up_var_locked(dir, &dir->i_lock);
> + spin_unlock(&dir->i_lock);
> +}
> +
> /**
> * vfs_create - create new file
> * @idmap: idmap of the mount the inode was found from
> @@ -3396,10 +3424,13 @@ int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
> error = security_inode_create(dir, dentry, mode);
> if (error)
> return error;
> - if (dir->i_op->create_shared)
> + if (dir->i_op->create_shared) {
> error = dir->i_op->create_shared(idmap, dir, dentry, mode, want_excl);
> - else
> + } else {
> + inode_lock_dir(dir);
> error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
> + inode_unlock_dir(dir);
> + }
> if (!error)
> fsnotify_create(dir, dentry);
> return error;
> @@ -3699,16 +3730,19 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
> file->f_mode |= FMODE_CREATED;
> audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
>
> - if (dir_inode->i_op->create_shared)
> + if (dir_inode->i_op->create_shared) {
> error = dir_inode->i_op->create_shared(idmap, dir_inode,
> dentry, mode,
> open_flag & O_EXCL);
> - else if (dir_inode->i_op->create)
> + } else if (dir_inode->i_op->create) {
> + inode_lock_dir(dir_inode);
> error = dir_inode->i_op->create(idmap, dir_inode,
> dentry, mode,
> open_flag & O_EXCL);
> - else
> + inode_unlock_dir(dir_inode);
> + } else {
> error = -EACCES;
> + }
> if (error)
> goto out_dput;
> }
> @@ -4227,10 +4261,13 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
> if (error)
> return error;
>
> - if (dir->i_op->mknod_shared)
> + if (dir->i_op->mknod_shared) {
> error = dir->i_op->mknod_shared(idmap, dir, dentry, mode, dev);
> - else
> + } else {
> + inode_lock_dir(dir);
> error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
> + inode_unlock_dir(dir);
> + }
> if (!error)
> fsnotify_create(dir, dentry);
> return error;
> @@ -4360,7 +4397,9 @@ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
> else if (de)
> dput(de);
> } else {
> + inode_lock_dir(dir);
> error = dir->i_op->mkdir(idmap, dir, dentry, mode);
> + inode_unlock_dir(dir);
> }
> if (!error)
> fsnotify_mkdir(dir, dentry);
> @@ -4521,10 +4560,13 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
> if (error)
> goto out;
>
> - if (dir->i_op->rmdir_shared)
> + if (dir->i_op->rmdir_shared) {
> error = dir->i_op->rmdir_shared(dir, dentry);
> - else
> + } else {
> + inode_lock_dir(dir);
> error = dir->i_op->rmdir(dir, dentry);
> + inode_unlock_dir(dir);
> + }
> if (error)
> goto out;
>
> @@ -4648,10 +4690,13 @@ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
> error = try_break_deleg(target, delegated_inode);
> if (error)
> goto out;
> - if (dir->i_op->unlink_shared)
> + if (dir->i_op->unlink_shared) {
> error = dir->i_op->unlink_shared(dir, dentry);
> - else
> + } else {
> + inode_lock_dir(dir);
> error = dir->i_op->unlink(dir, dentry);
> + inode_unlock_dir(dir);
> + }
> if (!error) {
> dont_mount(dentry);
> detach_mounts(dentry);
> @@ -4792,10 +4837,13 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
> if (error)
> return error;
>
> - if (dir->i_op->symlink_shared)
> + if (dir->i_op->symlink_shared) {
> error = dir->i_op->symlink_shared(idmap, dir, dentry, oldname);
> - else
> + } else {
> + inode_lock_dir(dir);
> error = dir->i_op->symlink(idmap, dir, dentry, oldname);
> + inode_unlock_dir(dir);
> + }
> if (!error)
> fsnotify_create(dir, dentry);
> return error;
> @@ -4920,10 +4968,13 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
> error = try_break_deleg(inode, delegated_inode);
> if (error)
> ;
> - else if (dir->i_op->link_shared)
> + else if (dir->i_op->link_shared) {
> error = dir->i_op->link_shared(old_dentry, dir, new_dentry);
> - else
> + } else {
> + inode_lock_dir(dir);
> error = dir->i_op->link(old_dentry, dir, new_dentry);
> + inode_unlock_dir(dir);
> + }
> }
>
> if (!error && (inode->i_state & I_LINKABLE)) {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 68eba181175b..3ca92a54f28e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -722,6 +722,8 @@ struct inode {
> void (*free_inode)(struct inode *);
> };
> struct file_lock_context *i_flctx;
> +
> + struct lockdep_map i_dirlock_map; /* For tracking I_DIR_LOCKED locks */
The cost of this map says no to any attempt inventing mutex in any form.
On Sat, 21 Dec 2024, Hillf Danton wrote:
> On Fri, 20 Dec 2024 13:54:26 +1100 NeilBrown <neilb@suse.de>
> > During the transition from providing exclusive locking on the directory
> > for directory modifying operation to providing exclusive locking only on
> > the dentry with a shared lock on the directory - we need an alternate
> > way to provide exclusion on the directory for file systems which haven't
> > been converted. This is provided by inode_dir_lock() and
> > inode_dir_inlock().
> > This uses a bit in i_state for locking, and wait_var_event_spinlock() for
> > waiting.
> >
> Inventing anything like mutex sounds bad.
In general I would agree. But when the cost of adding a mutex exceeds
the cost of using an alternate solution that only requires 2 bits, I
think the alternate solution is justified.
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -722,6 +722,8 @@ struct inode {
> > void (*free_inode)(struct inode *);
> > };
> > struct file_lock_context *i_flctx;
> > +
> > + struct lockdep_map i_dirlock_map; /* For tracking I_DIR_LOCKED locks */
>
> The cost of this map says no to any attempt inventing mutex in any form.
>
"struct lockdep_map" is size-zero when lockdep is not enabled. And when
it is enabled we accept the cost of larger structures to benefit from
deadlock detection.
So I don't think this is a sound argument.
Thanks for the review,
NeilBrown
On Mon, 23 Dec 2024 14:10:07 +1100 NeilBrown <neilb@suse.de> > On Sat, 21 Dec 2024, Hillf Danton wrote: > > Inventing anything like mutex sounds bad. > > In general I would agree. But when the cost of adding a mutex exceeds > the cost of using an alternate solution that only requires 2 bits, I > think the alternate solution is justified. > Inode deserves more than the 2 bits before such a solution is able to rework mutex.
On Mon, 23 Dec 2024, Hillf Danton wrote: > On Mon, 23 Dec 2024 14:10:07 +1100 NeilBrown <neilb@suse.de> > > On Sat, 21 Dec 2024, Hillf Danton wrote: > > > Inventing anything like mutex sounds bad. > > > > In general I would agree. But when the cost of adding a mutex exceeds > > the cost of using an alternate solution that only requires 2 bits, I > > think the alternate solution is justified. > > > Inode deserves more than the 2 bits before such a solution is able to > rework mutex. I'm sorry but I don't understand what you are saying. Could you please give more details about your concern? Are you concerned about correctness? Performance? Maintainability? Something else? Thanks, NeilBrown
On Tue, 24 Dec 2024 07:36:08 +1100 NeilBrown <neilb@suse.de> > On Mon, 23 Dec 2024, Hillf Danton wrote: > > On Mon, 23 Dec 2024 14:10:07 +1100 NeilBrown <neilb@suse.de> > > > On Sat, 21 Dec 2024, Hillf Danton wrote: > > > > Inventing anything like mutex sounds bad. > > > > > > In general I would agree. But when the cost of adding a mutex exceeds > > > the cost of using an alternate solution that only requires 2 bits, I > > > think the alternate solution is justified. > > > > > Inode deserves more than the 2 bits before such a solution is able to > > rework mutex. > > I'm sorry but I don't understand what you are saying. Could you please > give more details about your concern? > Are you concerned about correctness? Performance? Maintainability? > Something else? > It is that you are adding a pair of honey bee wings to A380, so no takeoff can be expected.
© 2016 - 2026 Red Hat, Inc.