This reduces single-threaded overhead as it avoids one lock+irq trip on
exit.
It also improves scalability of spawning and killing threads within one
process (just shy of 5% when doing it on 24 cores on my test jig).
Both routines are moved below kcov and kmsan exit, which should be
harmless.
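In short, the do_exit() side goes from two siglock+irq round trips to one;
a simplified excerpt of the resulting flow (full details in the diff below):

        /* before: each helper took and dropped siglock on its own */
        synchronize_group_exit(tsk, code);
        ...
        coredump_task_exit(tsk);

        /* after: one critical section covers both updates */
        spin_lock_irq(&sighand->siglock);
        synchronize_group_exit(tsk, code);
        core_state = coredump_task_exit_prep(tsk);
        spin_unlock_irq(&sighand->siglock);

        coredump_task_exit_finish(tsk, core_state);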
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
v2:
- push the routines after kcov and kmsan
I kept the routines separate; imo that's cleaner and might help pull up more
work later.
If you are ok with the patch as a whole but want cosmetic changes, I think it
will be fastest if you just do them yourself and submit your own version. I
don't need credit; feel free to steal the bench result.
Alternatively, should you want to bench it yourself, plop the following into
will-it-scale/tests/threadspawn1.c:
#include <assert.h>
#include <pthread.h>

char *testcase_description = "Thread creation and teardown";

static void *worker(void *arg)
{
        return (NULL);
}

void testcase(unsigned long long *iterations, unsigned long nr)
{
        pthread_t thread;
        int error;

        while (1) {
                error = pthread_create(&thread, NULL, worker, NULL);
                assert(error == 0);
                error = pthread_join(thread, NULL);
                assert(error == 0);

                (*iterations)++;
        }
}
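For reference, assuming a standard will-it-scale checkout (the exact binary
names and flags may differ on your setup): the harness builds
threadspawn1_processes and threadspawn1_threads from that file, so something
like

        make
        ./threadspawn1_threads -t 24

should exercise the threaded case across 24 workers.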
kernel/exit.c | 72 ++++++++++++++++++++++++++++-----------------------
1 file changed, 40 insertions(+), 32 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index f97a2bbc9db9..055f645b3ab1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -412,9 +412,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 	}
 }
-static void coredump_task_exit(struct task_struct *tsk)
+static struct core_state *coredump_task_exit_prep(struct task_struct *tsk)
 {
-	struct core_state *core_state;
+	lockdep_assert_held(&tsk->sighand->siglock);
 	/*
 	 * Serialize with any possible pending coredump.
@@ -423,33 +423,37 @@ static void coredump_task_exit(struct task_struct *tsk)
 	 * will increment ->nr_threads for each thread in the
 	 * group without PF_POSTCOREDUMP set.
 	 */
-	spin_lock_irq(&tsk->sighand->siglock);
 	tsk->flags |= PF_POSTCOREDUMP;
-	core_state = tsk->signal->core_state;
-	spin_unlock_irq(&tsk->sighand->siglock);
-	if (core_state) {
-		struct core_thread self;
-
-		self.task = current;
-		if (self.task->flags & PF_SIGNALED)
-			self.next = xchg(&core_state->dumper.next, &self);
-		else
-			self.task = NULL;
-		/*
-		 * Implies mb(), the result of xchg() must be visible
-		 * to core_state->dumper.
-		 */
-		if (atomic_dec_and_test(&core_state->nr_threads))
-			complete(&core_state->startup);
+	return tsk->signal->core_state;
+}
-		for (;;) {
-			set_current_state(TASK_IDLE|TASK_FREEZABLE);
-			if (!self.task) /* see coredump_finish() */
-				break;
-			schedule();
-		}
-		__set_current_state(TASK_RUNNING);
+static void coredump_task_exit_finish(struct task_struct *tsk,
+				      struct core_state *core_state)
+{
+	struct core_thread self;
+
+	if (likely(!core_state))
+		return;
+
+	self.task = current;
+	if (self.task->flags & PF_SIGNALED)
+		self.next = xchg(&core_state->dumper.next, &self);
+	else
+		self.task = NULL;
+	/*
+	 * Implies mb(), the result of xchg() must be visible
+	 * to core_state->dumper.
+	 */
+	if (atomic_dec_and_test(&core_state->nr_threads))
+		complete(&core_state->startup);
+
+	for (;;) {
+		set_current_state(TASK_IDLE|TASK_FREEZABLE);
+		if (!self.task) /* see coredump_finish() */
+			break;
+		schedule();
 	}
+	__set_current_state(TASK_RUNNING);
 }
 #ifdef CONFIG_MEMCG
@@ -878,7 +882,8 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
 	struct sighand_struct *sighand = tsk->sighand;
 	struct signal_struct *signal = tsk->signal;
-	spin_lock_irq(&sighand->siglock);
+	lockdep_assert_held(&sighand->siglock);
+
 	signal->quick_threads--;
 	if ((signal->quick_threads == 0) &&
 	    !(signal->flags & SIGNAL_GROUP_EXIT)) {
@@ -886,24 +891,27 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
 		signal->group_exit_code = code;
 		signal->group_stop_count = 0;
 	}
-	spin_unlock_irq(&sighand->siglock);
 }
 void __noreturn do_exit(long code)
 {
 	struct task_struct *tsk = current;
+	struct sighand_struct *sighand = tsk->sighand;
+	struct core_state *core_state;
 	int group_dead;
 	WARN_ON(irqs_disabled());
-
-	synchronize_group_exit(tsk, code);
-
 	WARN_ON(tsk->plug);
 	kcov_task_exit(tsk);
 	kmsan_task_exit(tsk);
-	coredump_task_exit(tsk);
+	spin_lock_irq(&sighand->siglock);
+	synchronize_group_exit(tsk, code);
+	core_state = coredump_task_exit_prep(tsk);
+	spin_unlock_irq(&sighand->siglock);
+
+	coredump_task_exit_finish(tsk, core_state);
 	ptrace_event(PTRACE_EVENT_EXIT, code);
 	user_events_exit(tsk);
--
2.43.0
On 03/19, Mateusz Guzik wrote:
>
> + spin_lock_irq(&sighand->siglock);
> + synchronize_group_exit(tsk, code);
> + core_state = coredump_task_exit_prep(tsk);
> + spin_unlock_irq(&sighand->siglock);
Well, but why do we need the new (and trivial) coredump_task_exit_prep?
Can't synchronize_group_exit() be
static struct core_state *synchronize_group_exit(struct task_struct *tsk, long code)
{
        struct sighand_struct *sighand = tsk->sighand;
        struct signal_struct *signal = tsk->signal;
        struct core_state *core_state = NULL;

        spin_lock_irq(&sighand->siglock);
        signal->quick_threads--;
        if ((signal->quick_threads == 0) &&
            !(signal->flags & SIGNAL_GROUP_EXIT)) {
                signal->flags = SIGNAL_GROUP_EXIT;
                signal->group_exit_code = code;
                signal->group_stop_count = 0;
        }
        /*
         * Serialize with any possible pending coredump.
         * We must hold siglock around checking core_state
         * and setting PF_POSTCOREDUMP. The core-inducing thread
         * will increment ->nr_threads for each thread in the
         * group without PF_POSTCOREDUMP set.
         */
        tsk->flags |= PF_POSTCOREDUMP;
        core_state = tsk->signal->core_state;
        spin_unlock_irq(&sighand->siglock);

        return core_state;
}
?
No need to shift spin_lock_irq(siglock) from synchronize_group_exit() to do_exit(),
no need to rename coredump_task_exit...
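The do_exit() side would then presumably just consume the return value;
a sketch of the implied caller (not code posted in this thread):

        core_state = synchronize_group_exit(tsk, code);
        ...
        coredump_task_exit(tsk, core_state);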
Oleg.
On 03/19, Oleg Nesterov wrote:
>
> On 03/19, Mateusz Guzik wrote:
> >
> > + spin_lock_irq(&sighand->siglock);
> > + synchronize_group_exit(tsk, code);
> > + core_state = coredump_task_exit_prep(tsk);
> > + spin_unlock_irq(&sighand->siglock);
>
> Well, but why do we need the new (and trivial) coredump_task_exit_prep?
>
> Can't synchronize_group_exit() be
>
> static struct core_state *synchronize_group_exit(struct task_struct *tsk, long code)
> {
>         struct sighand_struct *sighand = tsk->sighand;
>         struct signal_struct *signal = tsk->signal;
>         struct core_state *core_state = NULL;
>
>         spin_lock_irq(&sighand->siglock);
>         signal->quick_threads--;
>         if ((signal->quick_threads == 0) &&
>             !(signal->flags & SIGNAL_GROUP_EXIT)) {
>                 signal->flags = SIGNAL_GROUP_EXIT;
>                 signal->group_exit_code = code;
>                 signal->group_stop_count = 0;
>         }
>         /*
>          * Serialize with any possible pending coredump.
>          * We must hold siglock around checking core_state
>          * and setting PF_POSTCOREDUMP. The core-inducing thread
>          * will increment ->nr_threads for each thread in the
>          * group without PF_POSTCOREDUMP set.
>          */
>         tsk->flags |= PF_POSTCOREDUMP;
>         core_state = tsk->signal->core_state;
>         spin_unlock_irq(&sighand->siglock);
>
>         return core_state;
> }
>
> ?
Or even better,
static void synchronize_group_exit(struct task_struct *tsk, long code)
{
        struct sighand_struct *sighand = tsk->sighand;
        struct signal_struct *signal = tsk->signal;
        struct core_state *core_state = NULL;

        spin_lock_irq(&sighand->siglock);
        signal->quick_threads--;
        if ((signal->quick_threads == 0) &&
            !(signal->flags & SIGNAL_GROUP_EXIT)) {
                signal->flags = SIGNAL_GROUP_EXIT;
                signal->group_exit_code = code;
                signal->group_stop_count = 0;
        }
        /*
         * Serialize with any possible pending coredump.
         * We must hold siglock around checking core_state
         * and setting PF_POSTCOREDUMP. The core-inducing thread
         * will increment ->nr_threads for each thread in the
         * group without PF_POSTCOREDUMP set.
         */
        tsk->flags |= PF_POSTCOREDUMP;
        core_state = tsk->signal->core_state;
        spin_unlock_irq(&sighand->siglock);

        if (core_state)
                coredump_task_exit(tsk, core_state);
}
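Here coredump_task_exit() would presumably take core_state and keep only the
dumper-wait part, i.e. roughly the body of coredump_task_exit_finish() from
the v2 patch; a sketch of that shape (not code posted in this thread):

static void coredump_task_exit(struct task_struct *tsk,
                               struct core_state *core_state)
{
        struct core_thread self;

        self.task = current;
        if (self.task->flags & PF_SIGNALED)
                self.next = xchg(&core_state->dumper.next, &self);
        else
                self.task = NULL;
        /*
         * Implies mb(), the result of xchg() must be visible
         * to core_state->dumper.
         */
        if (atomic_dec_and_test(&core_state->nr_threads))
                complete(&core_state->startup);

        for (;;) {
                set_current_state(TASK_IDLE|TASK_FREEZABLE);
                if (!self.task) /* see coredump_finish() */
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);
}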
> No need to shift spin_lock_irq(siglock) from synchronize_group_exit() to do_exit(),
> no need to rename coredump_task_exit...
do_exit() is already big enough...
Oleg.
On Wed, Mar 19, 2025 at 8:10 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 03/19, Mateusz Guzik wrote:
> >
> > + spin_lock_irq(&sighand->siglock);
> > + synchronize_group_exit(tsk, code);
> > + core_state = coredump_task_exit_prep(tsk);
> > + spin_unlock_irq(&sighand->siglock);
>
> Well, but why do we need the new (and trivial) coredump_task_exit_prep?
>
it's not *needed*; I claim it is cleaner.
But I'm not going to argue about this; I'll submit an updated patch
later this evening or tomorrow.
> Can't synchronize_group_exit() be
>
> static struct core_state *synchronize_group_exit(struct task_struct *tsk, long code)
> {
>         struct sighand_struct *sighand = tsk->sighand;
>         struct signal_struct *signal = tsk->signal;
>         struct core_state *core_state = NULL;
>
>         spin_lock_irq(&sighand->siglock);
>         signal->quick_threads--;
>         if ((signal->quick_threads == 0) &&
>             !(signal->flags & SIGNAL_GROUP_EXIT)) {
>                 signal->flags = SIGNAL_GROUP_EXIT;
>                 signal->group_exit_code = code;
>                 signal->group_stop_count = 0;
>         }
>         /*
>          * Serialize with any possible pending coredump.
>          * We must hold siglock around checking core_state
>          * and setting PF_POSTCOREDUMP. The core-inducing thread
>          * will increment ->nr_threads for each thread in the
>          * group without PF_POSTCOREDUMP set.
>          */
>         tsk->flags |= PF_POSTCOREDUMP;
>         core_state = tsk->signal->core_state;
>         spin_unlock_irq(&sighand->siglock);
>
>         return core_state;
> }
>
> ?
>
> No need to shift spin_lock_irq(siglock) from synchronize_group_exit() to do_exit(),
> no need to rename coredump_task_exit...
>
> Oleg.
>
--
Mateusz Guzik <mjguzik gmail.com>