[PATCH 4/5] kernel: Prepare set_kthread_struct to be used for setup_thread_fn

Mike Christie posted 5 patches 2 years, 7 months ago
[PATCH 4/5] kernel: Prepare set_kthread_struct to be used for setup_thread_fn
Posted by Mike Christie 2 years, 7 months ago
This preps set_kthread_struct to be used for a setup_thread_fn callback
by having it set the task's comm and also returning an int instead of a
bool.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
---
 include/linux/kthread.h |  2 +-
 kernel/fork.c           |  2 +-
 kernel/kthread.c        | 89 ++++++++++++++++++++++++++++-------------
 kernel/sched/core.c     |  2 +-
 4 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 30e5bec81d2b..94dffdfa17df 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -34,7 +34,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  const char *namefmt);
 
 void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk);
-bool set_kthread_struct(struct task_struct *p);
+int kthread_setup_struct(struct task_struct *p, void *data);
 
 void kthread_set_per_cpu(struct task_struct *k, int cpu);
 bool kthread_is_per_cpu(struct task_struct *k);
diff --git a/kernel/fork.c b/kernel/fork.c
index 057274da64fb..bd60ecc6b29c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2190,7 +2190,7 @@ static __latent_entropy struct task_struct *copy_process(
 	audit_set_context(p, NULL);
 	cgroup_fork(p);
 	if (args->kthread) {
-		if (!set_kthread_struct(p))
+		if (kthread_setup_struct(p, args->fn_arg))
 			goto bad_fork_cleanup_delayacct;
 	}
 #ifdef CONFIG_NUMA
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5e7c8f3f184f..b67c2caf2ccb 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -46,6 +46,10 @@ struct kthread_create_info
 	struct task_struct *result;
 	struct completion *done;
 
+	struct mutex name_mutex;
+	const char *name_fmt;
+	va_list *name_args;
+
 	struct list_head list;
 };
 
@@ -107,23 +111,58 @@ void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
 	strscpy_pad(buf, kthread->full_name, buf_size);
 }
 
-bool set_kthread_struct(struct task_struct *p)
+static int set_kthread_comm(struct task_struct *tsk, struct kthread *kthread,
+			    struct kthread_create_info *create)
+{
+	va_list name_args;
+	int len;
+
+	mutex_lock(&create->name_mutex);
+	if (!create->name_args) {
+		mutex_unlock(&create->name_mutex);
+		return -EINTR;
+	}
+
+	va_copy(name_args, *create->name_args);
+	len = vsnprintf(tsk->comm, TASK_COMM_LEN, create->name_fmt, name_args);
+	va_end(name_args);
+	if (len >= TASK_COMM_LEN) {
+		/* leave it truncated when out of memory. */
+		kthread->full_name = kvasprintf(GFP_KERNEL, create->name_fmt,
+						*create->name_args);
+	}
+	mutex_unlock(&create->name_mutex);
+	return 0;
+}
+
+int kthread_setup_struct(struct task_struct *p, void *data)
 {
 	struct kthread *kthread;
+	int ret;
 
 	if (WARN_ON_ONCE(to_kthread(p)))
-		return false;
+		return -EINVAL;
 
 	kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
 	if (!kthread)
-		return false;
+		return -ENOMEM;
+
+	if (data) {
+		ret = set_kthread_comm(p, kthread, data);
+		if (ret)
+			goto free_kthread;
+	}
 
 	init_completion(&kthread->exited);
 	init_completion(&kthread->parked);
 	p->vfork_done = &kthread->exited;
 
 	p->worker_private = kthread;
-	return true;
+	return 0;
+
+free_kthread:
+	kfree(kthread);
+	return ret;
 }
 
 void free_kthread_struct(struct task_struct *k)
@@ -131,7 +170,7 @@ void free_kthread_struct(struct task_struct *k)
 	struct kthread *kthread;
 
 	/*
-	 * Can be NULL if kmalloc() in set_kthread_struct() failed.
+	 * Can be NULL if kmalloc() in kthread_setup_struct() failed.
 	 */
 	kthread = to_kthread(k);
 	if (!kthread)
@@ -423,6 +462,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct task_struct *task;
+	va_list name_args;
+	int ret;
 	struct kthread_create_info *create = kmalloc(sizeof(*create),
 						     GFP_KERNEL);
 
@@ -432,6 +473,10 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	create->data = data;
 	create->node = node;
 	create->done = &done;
+	mutex_init(&create->name_mutex);
+	create->name_fmt = namefmt;
+	va_copy(name_args, args);
+	create->name_args = &name_args;
 
 	spin_lock(&kthread_create_lock);
 	list_add_tail(&create->list, &kthread_create_list);
@@ -443,14 +488,20 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	 * the OOM killer while kthreadd is trying to allocate memory for
 	 * new kernel thread.
 	 */
-	if (unlikely(wait_for_completion_killable(&done))) {
+	ret = wait_for_completion_killable(&done);
+	if (unlikely(ret)) {
+		mutex_lock(&create->name_mutex);
+		create->name_args = NULL;
+		mutex_unlock(&create->name_mutex);
 		/*
 		 * If I was killed by a fatal signal before kthreadd (or new
 		 * kernel thread) calls complete(), leave the cleanup of this
 		 * structure to that thread.
 		 */
-		if (xchg(&create->done, NULL))
-			return ERR_PTR(-EINTR);
+		if (xchg(&create->done, NULL)) {
+			task = ERR_PTR(-EINTR);
+			goto end_args;
+		}
 		/*
 		 * kthreadd (or new kernel thread) will call complete()
 		 * shortly.
@@ -458,27 +509,9 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 		wait_for_completion(&done);
 	}
 	task = create->result;
-	if (!IS_ERR(task)) {
-		char name[TASK_COMM_LEN];
-		va_list aq;
-		int len;
-
-		/*
-		 * task is already visible to other tasks, so updating
-		 * COMM must be protected.
-		 */
-		va_copy(aq, args);
-		len = vsnprintf(name, sizeof(name), namefmt, aq);
-		va_end(aq);
-		if (len >= TASK_COMM_LEN) {
-			struct kthread *kthread = to_kthread(task);
-
-			/* leave it truncated when out of memory. */
-			kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
-		}
-		set_task_comm(task, name);
-	}
 	kfree(create);
+end_args:
+	va_end(name_args);
 	return task;
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e838feb6adc5..289b9941b58d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9925,7 +9925,7 @@ void __init sched_init(void)
 	 * if we want to avoid special-casing it in code that deals with per-CPU
 	 * kthreads.
 	 */
-	WARN_ON(!set_kthread_struct(current));
+	WARN_ON(kthread_setup_struct(current, NULL));
 
 	/*
 	 * Make us the idle thread. Technically, schedule() should not be
-- 
2.25.1
Re: [PATCH 4/5] kernel: Prepare set_kthread_struct to be used for setup_thread_fn
Posted by Linus Torvalds 2 years, 7 months ago
On Sun, Feb 12, 2023 at 5:00 PM Mike Christie
<michael.christie@oracle.com> wrote:
>
> This preps set_kthread_struct to be used for a setup_thread_fn callback
> by having it set the task's comm and also returning an int instead of a
> bool.

Ok, so I like the concept, but this patch is just too ugly for words,
and very very confused.

Now, some of it is pre-exisging nasty code just moved around:

> +       mutex_lock(&create->name_mutex);
> +       if (!create->name_args) {
> +               mutex_unlock(&create->name_mutex);
> +               return -EINTR;
> +       }
> +
> +       va_copy(name_args, *create->name_args);
> +       len = vsnprintf(tsk->comm, TASK_COMM_LEN, create->name_fmt, name_args);
> +       va_end(name_args);
> +       if (len >= TASK_COMM_LEN) {
> +               /* leave it truncated when out of memory. */
> +               kthread->full_name = kvasprintf(GFP_KERNEL, create->name_fmt,
> +                                               *create->name_args);
> +       }
> +       mutex_unlock(&create->name_mutex);

The *whole* point of my suggestion was to stop having silly pointless
locking on the name, because this all should be local to that one
thread creation, so that "name_mutex" kind of makes this all
pointless,

But what the heck is this:

> +       mutex_init(&create->name_mutex);
> +       create->name_fmt = namefmt;
> +       va_copy(name_args, args);
> +       create->name_args = &name_args;

That's just crazy talk.

Please just create the name once.

And please don't think that just because it was using a "va_list", you
need to keep it in that format.

Just make it create the name in __kthread_create_on_node() and be done
with it. That code already does a

        struct kthread_create_info *create = kmalloc(sizeof(*create), ..

and you can just make a sufficiently large buffer there. Don't worry
about "kthread->fuil_name" being huge, it should just be bigger than
16. Make it be 32 or something. Nobody wants a larger "full name"
anyway.

No name_mutex, no va_list that is bigger than the buffer we'd need for
the name anyway. Just "create the name once".

IOW, this patch is just being much too complicated for no good reason.
The point was to make it _simpler_ to do thread setup, not more
complicated.

        Linus
Re: [PATCH 4/5] kernel: Prepare set_kthread_struct to be used for setup_thread_fn
Posted by Mike Christie 2 years, 7 months ago
On 2/13/23 1:54 PM, Linus Torvalds wrote:
> IOW, this patch is just being much too complicated for no good reason.
> The point was to make it _simpler_ to do thread setup, not more
> complicated.

Ah ok, I think I better understand your original question/suggestion.
I was thinking we couldn't do:

diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe3541897..e94dd5a838d6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2109,6 +2109,11 @@ static __latent_entropy struct task_struct *copy_process(
 		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
 	}
 
+
+	if (args->name)
+		snprintf(p->comm, TASK_COMM_LEN, "%s-%d", args->name,
+			 current->pid);
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
 	/*
 	 * Clear TID on mm_release()?

 
because it would only support vhost and io_uring's sqpoll.c case which use the
parent pid in the name. I went wild trying to support every case and also kthread's
struct setup :)

I could balance it by doing the following which would support vhost, io_uring sqpoll
and kthread's name setting:


diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 357e0068497c..81179faa943d 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -23,6 +23,7 @@ struct kernel_clone_args {
 	int __user *pidfd;
 	int __user *child_tid;
 	int __user *parent_tid;
+	const char *name;
 	int exit_signal;
 	unsigned long stack;
 	unsigned long stack_size;
@@ -89,9 +90,11 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct task_struct *);
 
 extern pid_t kernel_clone(struct kernel_clone_args *kargs);
-struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg,
+				     const char *name, int node);
 struct task_struct *fork_idle(int);
-extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+			   unsigned long flags);
 extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
 extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
 int kernel_wait(pid_t pid, int *stat);
diff --git a/init/main.c b/init/main.c
index e1c3911d7c70..9dc816aa904f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -707,7 +707,7 @@ noinline void __ref rest_init(void)
 	rcu_read_unlock();
 
 	numa_default_policy();
-	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
+	pid = kernel_thread(kthreadd, NULL, NULL, CLONE_FS | CLONE_FILES);
 	rcu_read_lock();
 	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
 	rcu_read_unlock();
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 411bb2d1acd4..b399ef30882d 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -748,7 +748,7 @@ static void create_worker_cont(struct callback_head *cb)
 	worker = container_of(cb, struct io_worker, create_work);
 	clear_bit_unlock(0, &worker->create_state);
 	wqe = worker->wqe;
-	tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
+	tsk = create_io_thread(io_wqe_worker, worker, NULL, wqe->node);
 	if (!IS_ERR(tsk)) {
 		io_init_new_worker(wqe, worker, tsk);
 		io_worker_release(worker);
@@ -817,7 +817,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	if (index == IO_WQ_ACCT_BOUND)
 		worker->flags |= IO_WORKER_F_BOUND;
 
-	tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
+	tsk = create_io_thread(io_wqe_worker, worker, NULL, wqe->node);
 	if (!IS_ERR(tsk)) {
 		io_init_new_worker(wqe, worker, tsk);
 	} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 559652380672..4b0388e15671 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -223,12 +223,8 @@ static int io_sq_thread(void *data)
 	struct io_sq_data *sqd = data;
 	struct io_ring_ctx *ctx;
 	unsigned long timeout = 0;
-	char buf[TASK_COMM_LEN];
 	DEFINE_WAIT(wait);
 
-	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
-	set_task_comm(current, buf);
-
 	if (sqd->sq_cpu != -1)
 		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
 	else
@@ -350,6 +346,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 		fdput(f);
 	}
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		char name[TASK_COMM_LEN];
 		struct task_struct *tsk;
 		struct io_sq_data *sqd;
 		bool attached;
@@ -395,7 +392,8 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 
 		sqd->task_pid = current->pid;
 		sqd->task_tgid = current->tgid;
-		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
+		snprintf(name, sizeof(name), "iou-sqp-%d", sqd->task_pid);
+		tsk = create_io_thread(io_sq_thread, sqd, name, NUMA_NO_NODE);
 		if (IS_ERR(tsk)) {
 			ret = PTR_ERR(tsk);
 			goto err_sqpoll;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe3541897..c566512281e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2109,6 +2109,9 @@ static __latent_entropy struct task_struct *copy_process(
 		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
 	}
 
+	if (args->name)
+		snprintf(p->comm, TASK_COMM_LEN, "%s", args->name);
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
 	/*
 	 * Clear TID on mm_release()?
@@ -2613,7 +2616,8 @@ struct task_struct * __init fork_idle(int cpu)
  * The returned task is inactive, and the caller must fire it up through
  * wake_up_new_task(p). All signals are blocked in the created task.
  */
-struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg,
+				     const char *name, int node)
 {
 	unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
 				CLONE_IO;
@@ -2624,6 +2628,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
 		.fn		= fn,
 		.fn_arg		= arg,
 		.io_thread	= 1,
+		.name		= name,
 	};
 
 	return copy_process(NULL, 0, node, &args);
@@ -2727,7 +2732,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 /*
  * Create a kernel thread.
  */
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+		    unsigned long flags)
 {
 	struct kernel_clone_args args = {
 		.flags		= ((lower_32_bits(flags) | CLONE_VM |
@@ -2735,6 +2741,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
 		.fn		= fn,
 		.fn_arg		= arg,
+		.name		= name,
 		.kthread	= 1,
 	};
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index f97fd01a2932..20fdab8c6b25 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -38,6 +38,7 @@ struct task_struct *kthreadd_task;
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */
+	char *full_name;
 	int (*threadfn)(void *data);
 	void *data;
 	int node;
@@ -343,10 +344,12 @@ static int kthread(void *_create)
 	/* Release the structure when caller killed by a fatal signal. */
 	done = xchg(&create->done, NULL);
 	if (!done) {
+		kfree(create->full_name);
 		kfree(create);
 		kthread_exit(-EINTR);
 	}
 
+	self->full_name = create->full_name;
 	self->threadfn = threadfn;
 	self->data = data;
 
@@ -396,12 +399,14 @@ static void create_kthread(struct kthread_create_info *create)
 	current->pref_node_fork = create->node;
 #endif
 	/* We want our own signal handler (we take no signals by default). */
-	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+	pid = kernel_thread(kthread, create, create->full_name,
+			    CLONE_FS | CLONE_FILES | SIGCHLD);
 	if (pid < 0) {
 		/* Release the structure when caller killed by a fatal signal. */
 		struct completion *done = xchg(&create->done, NULL);
 
 		if (!done) {
+			kfree(create->full_name);
 			kfree(create);
 			return;
 		}
@@ -427,6 +432,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	create->data = data;
 	create->node = node;
 	create->done = &done;
+	create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
+	if (!create->full_name) {
+		task = ERR_PTR(-ENOMEM);
+		goto free_create;
+	}
 
 	spin_lock(&kthread_create_lock);
 	list_add_tail(&create->list, &kthread_create_list);
@@ -453,26 +463,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 		wait_for_completion(&done);
 	}
 	task = create->result;
-	if (!IS_ERR(task)) {
-		char name[TASK_COMM_LEN];
-		va_list aq;
-		int len;
-
-		/*
-		 * task is already visible to other tasks, so updating
-		 * COMM must be protected.
-		 */
-		va_copy(aq, args);
-		len = vsnprintf(name, sizeof(name), namefmt, aq);
-		va_end(aq);
-		if (len >= TASK_COMM_LEN) {
-			struct kthread *kthread = to_kthread(task);
-
-			/* leave it truncated when out of memory. */
-			kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
-		}
-		set_task_comm(task, name);
-	}
+free_create:
 	kfree(create);
 	return task;
 }
Re: [PATCH 4/5] kernel: Prepare set_kthread_struct to be used for setup_thread_fn
Posted by Linus Torvalds 2 years, 6 months ago
On Mon, Feb 13, 2023 at 4:50 PM Mike Christie
<michael.christie@oracle.com> wrote:
>
> I could balance it by doing the following which would support vhost, io_uring sqpoll
> and kthread's name setting:

Yeah, so this patch looks like a real simplification to me, and avoids
the different users doing their own random task name handling.

Thanks,
           Linus