[PATCH RFC v2 01/23] fs: notice when init abandons fs sharing

Christian Brauner posted 23 patches 1 month ago
There is a newer version of this series
[PATCH RFC v2 01/23] fs: notice when init abandons fs sharing
Posted by Christian Brauner 1 month ago
PID 1 may choose to stop sharing fs_struct state with us. Either via
unshare(CLONE_FS) or unshare(CLONE_NEWNS). Of course, PID 1 could have
chosen to create arbitrary process trees that all share fs_struct state
via CLONE_FS. This is a strong statement: We only care about PID 1 aka
the thread-group leader so subthread's fs_struct state doesn't matter.

PID 1 unsharing fs_struct state is a bug. PID 1 relies on various
kthreads to be able to perform work based on its fs_struct state.
Breaking that contract sucks for both sides. So just don't bother with
extra work for this. No sane init system should ever do this.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs_struct.c            | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/fs_struct.h |  2 ++
 kernel/fork.c             | 14 +++-----------
 3 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 394875d06fd6..3ff79fb894c1 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -147,6 +147,47 @@ int unshare_fs_struct(void)
 }
 EXPORT_SYMBOL_GPL(unshare_fs_struct);
 
+/*
+ * PID 1 may choose to stop sharing fs_struct state with us.
+ * Either via unshare(CLONE_FS) or unshare(CLONE_NEWNS). Of
+ * course, PID 1 could have chosen to create arbitrary process
+ * trees that all share fs_struct state via CLONE_FS. This is a
+ * strong statement: We only care about PID 1 aka the thread-group
+ * leader so subthread's fs_struct state doesn't matter.
+ *
+ * PID 1 unsharing fs_struct state is a bug. PID 1 relies on
+ * various kthreads to be able to perform work based on its
+ * fs_struct state. Breaking that contract sucks for both sides.
+ * So just don't bother with extra work for this. No sane init
+ * system should ever do this.
+ */
+static inline void nullfs_userspace_init(struct fs_struct *old_fs)
+{
+	if (likely(current->pid != 1))
+		return;
+	/* @old_fs may be dangling but for comparison it's fine */
+	if (old_fs != &init_fs)
+		return;
+	pr_warn("VFS: Pid 1 stopped sharing filesystem state\n");
+}
+
+struct fs_struct *switch_fs_struct(struct fs_struct *new_fs)
+{
+	struct fs_struct *fs;
+
+	fs = current->fs;
+	read_seqlock_excl(&fs->seq);
+	current->fs = new_fs;
+	if (--fs->users)
+		new_fs = NULL;
+	else
+		new_fs = fs;
+	read_sequnlock_excl(&fs->seq);
+
+	nullfs_userspace_init(fs);
+	return new_fs;
+}
+
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 0070764b790a..ade459383f92 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -40,6 +40,8 @@ static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
 	read_sequnlock_excl(&fs->seq);
 }
 
+struct fs_struct *switch_fs_struct(struct fs_struct *new_fs);
+
 extern bool current_chrooted(void);
 
 static inline int current_umask(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 65113a304518..583078c69bbd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3123,7 +3123,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
  */
 int ksys_unshare(unsigned long unshare_flags)
 {
-	struct fs_struct *fs, *new_fs = NULL;
+	struct fs_struct *new_fs = NULL;
 	struct files_struct *new_fd = NULL;
 	struct cred *new_cred = NULL;
 	struct nsproxy *new_nsproxy = NULL;
@@ -3200,16 +3200,8 @@ int ksys_unshare(unsigned long unshare_flags)
 
 		task_lock(current);
 
-		if (new_fs) {
-			fs = current->fs;
-			read_seqlock_excl(&fs->seq);
-			current->fs = new_fs;
-			if (--fs->users)
-				new_fs = NULL;
-			else
-				new_fs = fs;
-			read_sequnlock_excl(&fs->seq);
-		}
+		if (new_fs)
+			new_fs = switch_fs_struct(new_fs);
 
 		if (new_fd)
 			swap(current->files, new_fd);

-- 
2.47.3
Re: [PATCH RFC v2 01/23] fs: notice when init abandons fs sharing
Posted by Christian Brauner 1 month ago
> +struct fs_struct *switch_fs_struct(struct fs_struct *new_fs)
> +{
> +	struct fs_struct *fs;
> +
> +	fs = current->fs;
> +	read_seqlock_excl(&fs->seq);
> +	current->fs = new_fs;
> +	if (--fs->users)
> +		new_fs = NULL;
> +	else
> +		new_fs = fs;
> +	read_sequnlock_excl(&fs->seq);
> +
> +	nullfs_userspace_init(fs);

This is called under task_lock() and nullfs_userspace_init() may sleep.
Oversight on my part.