Allow concurrent changes in a directory

[PATCH 08/11] VFS: add inode_dir_lock/unlock

Posted by NeilBrown 1 year, 1 month ago

During the transition from providing exclusive locking on the directory
for directory modifying operation to providing exclusive locking only on
the dentry with a shared lock on the directory - we need an alternate
way to provide exclusion on the directory for file systems which haven't
been converted.  This is provided by inode_dir_lock() and
inode_dir_inlock().
This uses a bit in i_state for locking, and wait_var_event_spinlock() for
waiting.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/inode.c         |  3 ++
 fs/namei.c         | 81 +++++++++++++++++++++++++++++++++++++---------
 include/linux/fs.h |  5 +++
 3 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 6b4c77268fc0..9ba69837aa56 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -492,6 +492,8 @@ EXPORT_SYMBOL(address_space_init_once);
  */
 void inode_init_once(struct inode *inode)
 {
+	static struct lock_class_key __key;
+
 	memset(inode, 0, sizeof(*inode));
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_devices);
@@ -501,6 +503,7 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_sb_list);
 	__address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
+	lockdep_init_map(&inode->i_dirlock_map, "I_DIR_LOCKED", &__key, 0);
 }
 EXPORT_SYMBOL(inode_init_once);
 
diff --git a/fs/namei.c b/fs/namei.c
index 371c80902c59..68750b15dbf4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3364,6 +3364,34 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
 	return mode;
 }
 
+static bool check_dir_locked(struct inode *dir)
+{
+	if (dir->i_state & I_DIR_LOCKED) {
+		dir->i_state |= I_DIR_LOCK_WAITER;
+		return true;
+	}
+	return false;
+}
+
+static void inode_lock_dir(struct inode *dir)
+{
+	lock_acquire_exclusive(&dir->i_dirlock_map, 0, 0, NULL, _THIS_IP_);
+	spin_lock(&dir->i_lock);
+	wait_var_event_spinlock(dir, !check_dir_locked(dir),
+				&dir->i_lock);
+	dir->i_state |= I_DIR_LOCKED;
+	spin_unlock(&dir->i_lock);
+}
+
+static void inode_unlock_dir(struct inode *dir)
+{
+	lock_map_release(&dir->i_dirlock_map);
+	spin_lock(&dir->i_lock);
+	dir->i_state &= ~(I_DIR_LOCKED | I_DIR_LOCK_WAITER);
+	wake_up_var_locked(dir, &dir->i_lock);
+	spin_unlock(&dir->i_lock);
+}
+
 /**
  * vfs_create - create new file
  * @idmap:	idmap of the mount the inode was found from
@@ -3396,10 +3424,13 @@ int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
 	error = security_inode_create(dir, dentry, mode);
 	if (error)
 		return error;
-	if (dir->i_op->create_shared)
+	if (dir->i_op->create_shared) {
 		error = dir->i_op->create_shared(idmap, dir, dentry, mode, want_excl);
-	else
+	} else {
+		inode_lock_dir(dir);
 		error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
+		inode_unlock_dir(dir);
+	}
 	if (!error)
 		fsnotify_create(dir, dentry);
 	return error;
@@ -3699,16 +3730,19 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 		file->f_mode |= FMODE_CREATED;
 		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
 
-		if (dir_inode->i_op->create_shared)
+		if (dir_inode->i_op->create_shared) {
 			error = dir_inode->i_op->create_shared(idmap, dir_inode,
 							       dentry, mode,
 							       open_flag & O_EXCL);
-		else if (dir_inode->i_op->create)
+		} else if (dir_inode->i_op->create) {
+			inode_lock_dir(dir_inode);
 			error = dir_inode->i_op->create(idmap, dir_inode,
 							dentry, mode,
 							open_flag & O_EXCL);
-		else
+			inode_unlock_dir(dir_inode);
+		} else {
 			error = -EACCES;
+		}
 		if (error)
 			goto out_dput;
 	}
@@ -4227,10 +4261,13 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	if (error)
 		return error;
 
-	if (dir->i_op->mknod_shared)
+	if (dir->i_op->mknod_shared) {
 		error = dir->i_op->mknod_shared(idmap, dir, dentry, mode, dev);
-	else
+	} else {
+		inode_lock_dir(dir);
 		error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
+		inode_unlock_dir(dir);
+	}
 	if (!error)
 		fsnotify_create(dir, dentry);
 	return error;
@@ -4360,7 +4397,9 @@ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 		else if (de)
 			dput(de);
 	} else {
+		inode_lock_dir(dir);
 		error = dir->i_op->mkdir(idmap, dir, dentry, mode);
+		inode_unlock_dir(dir);
 	}
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
@@ -4521,10 +4560,13 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
 	if (error)
 		goto out;
 
-	if (dir->i_op->rmdir_shared)
+	if (dir->i_op->rmdir_shared) {
 		error = dir->i_op->rmdir_shared(dir, dentry);
-	else
+	} else {
+		inode_lock_dir(dir);
 		error = dir->i_op->rmdir(dir, dentry);
+		inode_unlock_dir(dir);
+	}
 	if (error)
 		goto out;
 
@@ -4648,10 +4690,13 @@ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
 			error = try_break_deleg(target, delegated_inode);
 			if (error)
 				goto out;
-			if (dir->i_op->unlink_shared)
+			if (dir->i_op->unlink_shared) {
 				error = dir->i_op->unlink_shared(dir, dentry);
-			else
+			} else {
+				inode_lock_dir(dir);
 				error = dir->i_op->unlink(dir, dentry);
+				inode_unlock_dir(dir);
+			}
 			if (!error) {
 				dont_mount(dentry);
 				detach_mounts(dentry);
@@ -4792,10 +4837,13 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	if (error)
 		return error;
 
-	if (dir->i_op->symlink_shared)
+	if (dir->i_op->symlink_shared) {
 		error = dir->i_op->symlink_shared(idmap, dir, dentry, oldname);
-	else
+	} else {
+		inode_lock_dir(dir);
 		error = dir->i_op->symlink(idmap, dir, dentry, oldname);
+		inode_unlock_dir(dir);
+	}
 	if (!error)
 		fsnotify_create(dir, dentry);
 	return error;
@@ -4920,10 +4968,13 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
 		error = try_break_deleg(inode, delegated_inode);
 		if (error)
 			;
-		else if (dir->i_op->link_shared)
+		else if (dir->i_op->link_shared) {
 			error = dir->i_op->link_shared(old_dentry, dir, new_dentry);
-		else
+		} else {
+			inode_lock_dir(dir);
 			error = dir->i_op->link(old_dentry, dir, new_dentry);
+			inode_unlock_dir(dir);
+		}
 	}
 
 	if (!error && (inode->i_state & I_LINKABLE)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 68eba181175b..3ca92a54f28e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -722,6 +722,8 @@ struct inode {
 		void (*free_inode)(struct inode *);
 	};
 	struct file_lock_context	*i_flctx;
+
+	struct lockdep_map	i_dirlock_map;	/* For tracking I_DIR_LOCKED locks */
 	struct address_space	i_data;
 	struct list_head	i_devices;
 	union {
@@ -2493,6 +2495,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
 #define I_SYNC_QUEUED		(1 << 16)
 #define I_PINNING_NETFS_WB	(1 << 17)
 
+#define I_DIR_LOCK_WAITER	(1 << 30)
+#define I_DIR_LOCKED		(1 << 31)
+
 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
 #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
-- 
2.47.0

Re: [PATCH 08/11] VFS: add inode_dir_lock/unlock

Posted by Hillf Danton 1 year, 1 month ago

On Fri, 20 Dec 2024 13:54:26 +1100 NeilBrown <neilb@suse.de>
> During the transition from providing exclusive locking on the directory
> for directory modifying operation to providing exclusive locking only on
> the dentry with a shared lock on the directory - we need an alternate
> way to provide exclusion on the directory for file systems which haven't
> been converted.  This is provided by inode_dir_lock() and
> inode_dir_inlock().
> This uses a bit in i_state for locking, and wait_var_event_spinlock() for
> waiting.
> 
Inventing anything like mutex sounds bad.

> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/inode.c         |  3 ++
>  fs/namei.c         | 81 +++++++++++++++++++++++++++++++++++++---------
>  include/linux/fs.h |  5 +++
>  3 files changed, 74 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index 6b4c77268fc0..9ba69837aa56 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -492,6 +492,8 @@ EXPORT_SYMBOL(address_space_init_once);
>   */
>  void inode_init_once(struct inode *inode)
>  {
> +	static struct lock_class_key __key;
> +
>  	memset(inode, 0, sizeof(*inode));
>  	INIT_HLIST_NODE(&inode->i_hash);
>  	INIT_LIST_HEAD(&inode->i_devices);
> @@ -501,6 +503,7 @@ void inode_init_once(struct inode *inode)
>  	INIT_LIST_HEAD(&inode->i_sb_list);
>  	__address_space_init_once(&inode->i_data);
>  	i_size_ordered_init(inode);
> +	lockdep_init_map(&inode->i_dirlock_map, "I_DIR_LOCKED", &__key, 0);
>  }
>  EXPORT_SYMBOL(inode_init_once);
>  
> diff --git a/fs/namei.c b/fs/namei.c
> index 371c80902c59..68750b15dbf4 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -3364,6 +3364,34 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
>  	return mode;
>  }
>  
> +static bool check_dir_locked(struct inode *dir)
> +{
> +	if (dir->i_state & I_DIR_LOCKED) {
> +		dir->i_state |= I_DIR_LOCK_WAITER;
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static void inode_lock_dir(struct inode *dir)
> +{
> +	lock_acquire_exclusive(&dir->i_dirlock_map, 0, 0, NULL, _THIS_IP_);
> +	spin_lock(&dir->i_lock);
> +	wait_var_event_spinlock(dir, !check_dir_locked(dir),
> +				&dir->i_lock);
> +	dir->i_state |= I_DIR_LOCKED;
> +	spin_unlock(&dir->i_lock);
> +}
> +
> +static void inode_unlock_dir(struct inode *dir)
> +{
> +	lock_map_release(&dir->i_dirlock_map);
> +	spin_lock(&dir->i_lock);
> +	dir->i_state &= ~(I_DIR_LOCKED | I_DIR_LOCK_WAITER);
> +	wake_up_var_locked(dir, &dir->i_lock);
> +	spin_unlock(&dir->i_lock);
> +}
> +
>  /**
>   * vfs_create - create new file
>   * @idmap:	idmap of the mount the inode was found from
> @@ -3396,10 +3424,13 @@ int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
>  	error = security_inode_create(dir, dentry, mode);
>  	if (error)
>  		return error;
> -	if (dir->i_op->create_shared)
> +	if (dir->i_op->create_shared) {
>  		error = dir->i_op->create_shared(idmap, dir, dentry, mode, want_excl);
> -	else
> +	} else {
> +		inode_lock_dir(dir);
>  		error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
> +		inode_unlock_dir(dir);
> +	}
>  	if (!error)
>  		fsnotify_create(dir, dentry);
>  	return error;
> @@ -3699,16 +3730,19 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
>  		file->f_mode |= FMODE_CREATED;
>  		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
>  
> -		if (dir_inode->i_op->create_shared)
> +		if (dir_inode->i_op->create_shared) {
>  			error = dir_inode->i_op->create_shared(idmap, dir_inode,
>  							       dentry, mode,
>  							       open_flag & O_EXCL);
> -		else if (dir_inode->i_op->create)
> +		} else if (dir_inode->i_op->create) {
> +			inode_lock_dir(dir_inode);
>  			error = dir_inode->i_op->create(idmap, dir_inode,
>  							dentry, mode,
>  							open_flag & O_EXCL);
> -		else
> +			inode_unlock_dir(dir_inode);
> +		} else {
>  			error = -EACCES;
> +		}
>  		if (error)
>  			goto out_dput;
>  	}
> @@ -4227,10 +4261,13 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
>  	if (error)
>  		return error;
>  
> -	if (dir->i_op->mknod_shared)
> +	if (dir->i_op->mknod_shared) {
>  		error = dir->i_op->mknod_shared(idmap, dir, dentry, mode, dev);
> -	else
> +	} else {
> +		inode_lock_dir(dir);
>  		error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
> +		inode_unlock_dir(dir);
> +	}
>  	if (!error)
>  		fsnotify_create(dir, dentry);
>  	return error;
> @@ -4360,7 +4397,9 @@ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
>  		else if (de)
>  			dput(de);
>  	} else {
> +		inode_lock_dir(dir);
>  		error = dir->i_op->mkdir(idmap, dir, dentry, mode);
> +		inode_unlock_dir(dir);
>  	}
>  	if (!error)
>  		fsnotify_mkdir(dir, dentry);
> @@ -4521,10 +4560,13 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
>  	if (error)
>  		goto out;
>  
> -	if (dir->i_op->rmdir_shared)
> +	if (dir->i_op->rmdir_shared) {
>  		error = dir->i_op->rmdir_shared(dir, dentry);
> -	else
> +	} else {
> +		inode_lock_dir(dir);
>  		error = dir->i_op->rmdir(dir, dentry);
> +		inode_unlock_dir(dir);
> +	}
>  	if (error)
>  		goto out;
>  
> @@ -4648,10 +4690,13 @@ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
>  			error = try_break_deleg(target, delegated_inode);
>  			if (error)
>  				goto out;
> -			if (dir->i_op->unlink_shared)
> +			if (dir->i_op->unlink_shared) {
>  				error = dir->i_op->unlink_shared(dir, dentry);
> -			else
> +			} else {
> +				inode_lock_dir(dir);
>  				error = dir->i_op->unlink(dir, dentry);
> +				inode_unlock_dir(dir);
> +			}
>  			if (!error) {
>  				dont_mount(dentry);
>  				detach_mounts(dentry);
> @@ -4792,10 +4837,13 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
>  	if (error)
>  		return error;
>  
> -	if (dir->i_op->symlink_shared)
> +	if (dir->i_op->symlink_shared) {
>  		error = dir->i_op->symlink_shared(idmap, dir, dentry, oldname);
> -	else
> +	} else {
> +		inode_lock_dir(dir);
>  		error = dir->i_op->symlink(idmap, dir, dentry, oldname);
> +		inode_unlock_dir(dir);
> +	}
>  	if (!error)
>  		fsnotify_create(dir, dentry);
>  	return error;
> @@ -4920,10 +4968,13 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
>  		error = try_break_deleg(inode, delegated_inode);
>  		if (error)
>  			;
> -		else if (dir->i_op->link_shared)
> +		else if (dir->i_op->link_shared) {
>  			error = dir->i_op->link_shared(old_dentry, dir, new_dentry);
> -		else
> +		} else {
> +			inode_lock_dir(dir);
>  			error = dir->i_op->link(old_dentry, dir, new_dentry);
> +			inode_unlock_dir(dir);
> +		}
>  	}
>  
>  	if (!error && (inode->i_state & I_LINKABLE)) {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 68eba181175b..3ca92a54f28e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -722,6 +722,8 @@ struct inode {
>  		void (*free_inode)(struct inode *);
>  	};
>  	struct file_lock_context	*i_flctx;
> +
> +	struct lockdep_map	i_dirlock_map;	/* For tracking I_DIR_LOCKED locks */

The cost of this map says no to any attempt inventing mutex in any form.

Re: [PATCH 08/11] VFS: add inode_dir_lock/unlock

Posted by NeilBrown 1 year, 1 month ago

On Sat, 21 Dec 2024, Hillf Danton wrote:
> On Fri, 20 Dec 2024 13:54:26 +1100 NeilBrown <neilb@suse.de>
> > During the transition from providing exclusive locking on the directory
> > for directory modifying operation to providing exclusive locking only on
> > the dentry with a shared lock on the directory - we need an alternate
> > way to provide exclusion on the directory for file systems which haven't
> > been converted.  This is provided by inode_dir_lock() and
> > inode_dir_inlock().
> > This uses a bit in i_state for locking, and wait_var_event_spinlock() for
> > waiting.
> > 
> Inventing anything like mutex sounds bad.

In general I would agree.  But when the cost of adding a mutex exceeds
the cost of using an alternate solution that only requires 2 bits, I
think the alternate solution is justified.

> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -722,6 +722,8 @@ struct inode {
> >  		void (*free_inode)(struct inode *);
> >  	};
> >  	struct file_lock_context	*i_flctx;
> > +
> > +	struct lockdep_map	i_dirlock_map;	/* For tracking I_DIR_LOCKED locks */
> 
> The cost of this map says no to any attempt inventing mutex in any form.
> 

"struct lockdep_map" is size-zero when lockdep is not enabled.  And when
it is enabled we accept the cost of larger structures to benefit from
deadlock detection.
So I don't think this is a sound argument.

Thanks for the review,
NeilBrown

Re: [PATCH 08/11] VFS: add inode_dir_lock/unlock

Posted by Hillf Danton 1 year, 1 month ago

On Mon, 23 Dec 2024 14:10:07 +1100 NeilBrown <neilb@suse.de>
> On Sat, 21 Dec 2024, Hillf Danton wrote:
> > Inventing anything like mutex sounds bad.
> 
> In general I would agree.  But when the cost of adding a mutex exceeds
> the cost of using an alternate solution that only requires 2 bits, I
> think the alternate solution is justified.
> 
Inode deserves more than the 2 bits before such a solution is able to
rework mutex.

Re: [PATCH 08/11] VFS: add inode_dir_lock/unlock

Posted by NeilBrown 1 year, 1 month ago

On Mon, 23 Dec 2024, Hillf Danton wrote:
> On Mon, 23 Dec 2024 14:10:07 +1100 NeilBrown <neilb@suse.de>
> > On Sat, 21 Dec 2024, Hillf Danton wrote:
> > > Inventing anything like mutex sounds bad.
> > 
> > In general I would agree.  But when the cost of adding a mutex exceeds
> > the cost of using an alternate solution that only requires 2 bits, I
> > think the alternate solution is justified.
> > 
> Inode deserves more than the 2 bits before such a solution is able to
> rework mutex.

I'm sorry but I don't understand what you are saying.  Could you please
give more details about your concern?
Are you concerned about correctness?  Performance?  Maintainability?
Something else?

Thanks,
NeilBrown

Re: [PATCH 08/11] VFS: add inode_dir_lock/unlock

Posted by Hillf Danton 1 year, 1 month ago

On Tue, 24 Dec 2024 07:36:08 +1100 NeilBrown <neilb@suse.de>
> On Mon, 23 Dec 2024, Hillf Danton wrote:
> > On Mon, 23 Dec 2024 14:10:07 +1100 NeilBrown <neilb@suse.de>
> > > On Sat, 21 Dec 2024, Hillf Danton wrote:
> > > > Inventing anything like mutex sounds bad.
> > > 
> > > In general I would agree.  But when the cost of adding a mutex exceeds
> > > the cost of using an alternate solution that only requires 2 bits, I
> > > think the alternate solution is justified.
> > > 
> > Inode deserves more than the 2 bits before such a solution is able to
> > rework mutex.
> 
> I'm sorry but I don't understand what you are saying.  Could you please
> give more details about your concern?
> Are you concerned about correctness?  Performance?  Maintainability?
> Something else?
>
It is that you are adding a pair of honey bee wings to A380, so no takeoff
can be expected.