fs,kthread: isolate all kthreads in nullfs

[PATCH RFC DRAFT POC 11/11] fs: isolate all kthreads in nullfs
Posted by Christian Brauner 1 month, 1 week ago
Leave all kthreads isolated in nullfs and move userspace init into its
separate fs_struct that any kthread can grab on demand to perform
lookup. This isolates kthreads from userspace filesystem state quite a
bit and makes it hard for anyone to mess up when performing filesystem
operations from kthreads. Without LOOKUP_IN_INIT they will just not be
able to do anything at all: no lookup or creation.

Add a new struct kernel_clone_args extension that allows to create a
task that shares init's filesystem state. This is only going to be used
by user_mode_thread() which execute stuff in init's filesystem state.
That concept should go away.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs_struct.c             | 49 +++++++++++++++++++++++++++++++++++++++++++---
 fs/namei.c                 |  4 ++--
 fs/namespace.c             |  4 ----
 include/linux/fs_struct.h  |  1 +
 include/linux/init_task.h  |  1 +
 include/linux/sched/task.h |  1 +
 init/main.c                | 10 +++++++++-
 kernel/fork.c              | 26 +++++++++++++++++++++---
 8 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 64b5840131cb..164139c27380 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -8,6 +8,7 @@
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include "internal.h"
+#include "mount.h"
 
 /*
  * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
@@ -160,13 +161,30 @@ EXPORT_SYMBOL_GPL(unshare_fs_struct);
  * fs_struct state. Breaking that contract sucks for both sides.
  * So just don't bother with extra work for this. No sane init
  * system should ever do this.
+ *
+ * On older kernels if PID 1 unshared its filesystem state with us the
+ * kernel simply used the stale fs_struct state implicitly pinning
+ * anything that PID 1 had last used. Even if PID 1 might've moved on to
+ * some completely different fs_struct state and might've even unmounted
+ * the old root.
+ *
+ * This has hilarious consequences: Think continuing to dump coredump
+ * state into an implicitly pinned directory somewhere. Calling random
+ * binaries in the old rootfs via usermodehelpers.
+ *
+ * Be aggressive about this: We simply reject operating on stale
+ * fs_struct state by reverting to nullfs. Every kworker that does
+ * lookups after this point will fail. Every usermodehelper call will
+ * fail. Tough luck but let's be kind and emit a warning to userspace.
  */
 static inline bool nullfs_userspace_init(void)
 {
 	struct fs_struct *fs = current->fs;
 
-	if (unlikely(current->pid == 1) && fs != &init_fs) {
+	if (unlikely(current->pid == 1) && fs != &userspace_init_fs) {
 		pr_warn("VFS: Pid 1 stopped sharing filesystem state\n");
+		set_fs_root(&userspace_init_fs, &init_fs.root);
+		set_fs_pwd(&userspace_init_fs, &init_fs.root);
 		return true;
 	}
 
@@ -186,7 +204,9 @@ struct fs_struct *switch_fs_struct(struct fs_struct *new_fs)
 		new_fs = fs;
 	read_sequnlock_excl(&fs->seq);
 
-	nullfs_userspace_init();
+	/* one reference belongs to us */
+	if (nullfs_userspace_init())
+		return NULL;
 	return new_fs;
 }
 
@@ -197,8 +217,31 @@ struct fs_struct init_fs = {
 	.umask		= 0022,
 };
 
+struct fs_struct userspace_init_fs = {
+	.users		= 1,
+	.seq		= __SEQLOCK_UNLOCKED(userspace_init_fs.seq),
+	.umask		= 0022,
+};
+
 void init_root(struct path *root)
 {
-	get_fs_root(&init_fs, root);
+	get_fs_root(&userspace_init_fs, root);
 }
 EXPORT_SYMBOL_GPL(init_root);
+
+void __init init_userspace_fs(void)
+{
+	struct mount *m;
+	struct path root;
+
+	/* Move PID 1 from nullfs into the initramfs. */
+	m = topmost_overmount(current->nsproxy->mnt_ns->root);
+	root.mnt = &m->mnt;
+	root.dentry = root.mnt->mnt_root;
+
+	VFS_WARN_ON_ONCE(current->fs != &init_fs);
+	VFS_WARN_ON_ONCE(current->pid != 1);
+	set_fs_root(&userspace_init_fs, &root);
+	set_fs_pwd(&userspace_init_fs, &root);
+	switch_fs_struct(&userspace_init_fs);
+}
diff --git a/fs/namei.c b/fs/namei.c
index 976b1e9f7032..6cc53040e9eb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1102,7 +1102,7 @@ static int set_root(struct nameidata *nd)
 	struct fs_struct *fs;
 
 	if (nd->flags & LOOKUP_IN_INIT)
-		fs = &init_fs;
+		fs = &userspace_init_fs;
 	else
 		fs = current->fs;
 
@@ -2724,7 +2724,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 		struct fs_struct *fs;
 
 		if (nd->flags & LOOKUP_IN_INIT)
-			fs = &init_fs;
+			fs = &userspace_init_fs;
 		else
 			fs = current->fs;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 854f4fc66469..10056ac1dcd2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -6190,10 +6190,6 @@ static void __init init_mount_tree(void)
 
 	init_task.nsproxy->mnt_ns = &init_mnt_ns;
 	get_mnt_ns(&init_mnt_ns);
-
-	/* The root and pwd always point to the mutable rootfs. */
-	root.mnt	= mnt;
-	root.dentry	= mnt->mnt_root;
 	set_fs_pwd(current->fs, &root);
 	set_fs_root(current->fs, &root);
 
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 8ff1acd8389d..5c40fdc39550 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -50,5 +50,6 @@ static inline int current_umask(void)
 }
 
 void init_root(struct path *root);
+void __init init_userspace_fs(void);
 
 #endif /* _LINUX_FS_STRUCT_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index a6cb241ea00c..f27f88598394 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -24,6 +24,7 @@
 
 extern struct files_struct init_files;
 extern struct fs_struct init_fs;
+extern struct fs_struct userspace_init_fs;
 extern struct nsproxy init_nsproxy;
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 41ed884cffc9..e0c1ca8c6a18 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -31,6 +31,7 @@ struct kernel_clone_args {
 	u32 io_thread:1;
 	u32 user_worker:1;
 	u32 no_files:1;
+	u32 umh:1;
 	unsigned long stack;
 	unsigned long stack_size;
 	unsigned long tls;
diff --git a/init/main.c b/init/main.c
index 1cb395dd94e4..ca0d0914c63e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -102,6 +102,7 @@
 #include <linux/stackdepot.h>
 #include <linux/randomize_kstack.h>
 #include <linux/pidfs.h>
+#include <linux/fs_struct.h>
 #include <linux/ptdump.h>
 #include <linux/time_namespace.h>
 #include <linux/unaligned.h>
@@ -713,6 +714,11 @@ static __initdata DECLARE_COMPLETION(kthreadd_done);
 
 static noinline void __ref __noreturn rest_init(void)
 {
+	struct kernel_clone_args init_args = {
+		.flags		= (CLONE_FS | CLONE_VM | CLONE_UNTRACED),
+		.fn		= kernel_init,
+		.fn_arg		= NULL,
+	};
 	struct task_struct *tsk;
 	int pid;
 
@@ -722,7 +728,7 @@ static noinline void __ref __noreturn rest_init(void)
 	 * the init task will end up wanting to create kthreads, which, if
 	 * we schedule it before we create kthreadd, will OOPS.
 	 */
-	pid = user_mode_thread(kernel_init, NULL, CLONE_FS);
+	pid = kernel_clone(&init_args);
 	/*
 	 * Pin init on the boot CPU. Task migration is not properly working
 	 * until sched_init_smp() has been run. It will set the allowed
@@ -1574,6 +1580,8 @@ static int __ref kernel_init(void *unused)
 {
 	int ret;
 
+	init_userspace_fs();
+
 	/*
 	 * Wait until kthreadd is all set-up.
 	 */
diff --git a/kernel/fork.c b/kernel/fork.c
index 583078c69bbd..121538f58272 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1590,9 +1590,28 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
 	return 0;
 }
 
-static int copy_fs(u64 clone_flags, struct task_struct *tsk)
+static int copy_fs(u64 clone_flags, struct task_struct *tsk, bool umh)
 {
-	struct fs_struct *fs = current->fs;
+	struct fs_struct *fs;
+
+	/*
+	 * Usermodehelper may use userspace_init_fs filesystem state but
+	 * they don't get to create mount namespaces, share the
+	 * filesystem state, or be started from a non-initial mount
+	 * namespace.
+	 */
+	if (umh) {
+		if (clone_flags & (CLONE_NEWNS | CLONE_FS))
+			return -EINVAL;
+		if (current->nsproxy->mnt_ns != &init_mnt_ns)
+			return -EINVAL;
+	}
+
+	if (umh)
+		fs = &userspace_init_fs;
+	else
+		fs = current->fs;
+
 	if (clone_flags & CLONE_FS) {
 		/* tsk->fs is already what we want */
 		read_seqlock_excl(&fs->seq);
@@ -2211,7 +2230,7 @@ __latent_entropy struct task_struct *copy_process(
 	retval = copy_files(clone_flags, p, args->no_files);
 	if (retval)
 		goto bad_fork_cleanup_semundo;
-	retval = copy_fs(clone_flags, p);
+	retval = copy_fs(clone_flags, p, args->umh);
 	if (retval)
 		goto bad_fork_cleanup_files;
 	retval = copy_sighand(clone_flags, p);
@@ -2725,6 +2744,7 @@ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
 		.exit_signal	= (flags & CSIGNAL),
 		.fn		= fn,
 		.fn_arg		= arg,
+		.umh		= 1,
 	};
 
 	return kernel_clone(&args);

-- 
2.47.3