Both add_device_randomness() and attach_pid()/detach_pid() have their
own synchronisation mechanisms.

The clone side calls them *without* the tasklist_lock held, meaning
parallel calls can contend on their locks.

The exit side calls them *with* the tasklist_lock held, which means the
hold time is avoidably extended by waiting on either of the two locks,
in turn exacerbating contention on tasklist_lock itself.

Postponing the work until after the lock is dropped bumps the thread
creation/destruction rate by 15% on a 24-core VM.

Bench (plop into will-it-scale):
$ cat tests/threadspawn1.c

#include <pthread.h>
#include <assert.h>
#include <stddef.h>

char *testcase_description = "Thread creation and teardown";

static void *worker(void *arg)
{
	return (NULL);
}

void testcase(unsigned long long *iterations, unsigned long nr)
{
	pthread_t thread;
	int error;

	while (1) {
		error = pthread_create(&thread, NULL, worker, NULL);
		assert(error == 0);
		error = pthread_join(thread, NULL);
		assert(error == 0);
		(*iterations)++;
	}
}

Run:
$ ./threadspawn1_processes -t 24

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

v2:
- introduce a struct to collect work
- move out pids as well

There is more which can be pulled out.

This may look suspicious:
+	proc_flush_pid(p->thread_pid);

AFAICS this is constant for the duration of the task's lifetime, so I
don't think there is a problem.

 include/linux/pid.h |  1 +
 kernel/exit.c       | 53 +++++++++++++++++++++++++++++++++------------
 kernel/pid.c        | 23 +++++++++++++++-----
 3 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 98837a1ff0f3..6e9fcacd02cd 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -101,6 +101,7 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
  * these helpers must be called with the tasklist_lock write-held.
  */
 extern void attach_pid(struct task_struct *task, enum pid_type);
+extern struct pid *detach_pid_return(struct task_struct *task, enum pid_type);
 extern void detach_pid(struct task_struct *task, enum pid_type);
 extern void change_pid(struct task_struct *task, enum pid_type,
 			struct pid *pid);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1dcddfe537ee..4e452d3e3a89 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -122,14 +122,23 @@ static __init int kernel_exit_sysfs_init(void)
 late_initcall(kernel_exit_sysfs_init);
 #endif
 
-static void __unhash_process(struct task_struct *p, bool group_dead)
+/*
+ * For things release_task would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+	unsigned long long randomness;
+	struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+			     bool group_dead)
 {
 	nr_threads--;
-	detach_pid(p, PIDTYPE_PID);
+	post->pids[PIDTYPE_PID] = detach_pid_return(p, PIDTYPE_PID);
 	if (group_dead) {
-		detach_pid(p, PIDTYPE_TGID);
-		detach_pid(p, PIDTYPE_PGID);
-		detach_pid(p, PIDTYPE_SID);
+		post->pids[PIDTYPE_TGID] = detach_pid_return(p, PIDTYPE_TGID);
+		post->pids[PIDTYPE_PGID] = detach_pid_return(p, PIDTYPE_PGID);
+		post->pids[PIDTYPE_SID] = detach_pid_return(p, PIDTYPE_SID);
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
@@ -141,7 +150,8 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 /*
  * This function expects the tasklist_lock write-locked.
  */
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post,
+			  struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
 	bool group_dead = thread_group_leader(tsk);
@@ -174,8 +184,7 @@ static void __exit_signal(struct task_struct *tsk)
 			sig->curr_target = next_thread(tsk);
 	}
 
-	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
-			      sizeof(unsigned long long));
+	post->randomness = tsk->se.sum_exec_runtime;
 
 	/*
 	 * Accumulate here the counters for all threads as they die. We could
@@ -197,7 +206,7 @@ static void __exit_signal(struct task_struct *tsk)
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
 	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	sig->nr_threads--;
-	__unhash_process(tsk, group_dead);
+	__unhash_process(post, tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
 
 	/*
@@ -240,9 +249,13 @@ void __weak release_thread(struct task_struct *dead_task)
 void release_task(struct task_struct *p)
 {
 	struct task_struct *leader;
-	struct pid *thread_pid;
 	int zap_leader;
+	struct release_task_post post;
 repeat:
+	memset(&post, 0, sizeof(post));
+
+	proc_flush_pid(p->thread_pid);
+
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
@@ -253,8 +266,7 @@ void release_task(struct task_struct *p)
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
-	thread_pid = get_pid(p->thread_pid);
-	__exit_signal(p);
+	__exit_signal(&post, p);
 
 	/*
 	 * If we are the last non-leader member of the thread
@@ -276,8 +288,21 @@ void release_task(struct task_struct *p)
 	}
 
 	write_unlock_irq(&tasklist_lock);
-	proc_flush_pid(thread_pid);
-	put_pid(thread_pid);
+
+	/*
+	 * Process clean up deferred to after we drop the tasklist lock.
+	 */
+	add_device_randomness((const void*) &post.randomness,
+			      sizeof(unsigned long long));
+	if (post.pids[PIDTYPE_PID])
+		free_pid(post.pids[PIDTYPE_PID]);
+	if (post.pids[PIDTYPE_TGID])
+		free_pid(post.pids[PIDTYPE_TGID]);
+	if (post.pids[PIDTYPE_PGID])
+		free_pid(post.pids[PIDTYPE_PGID]);
+	if (post.pids[PIDTYPE_SID])
+		free_pid(post.pids[PIDTYPE_SID]);
+
 	release_thread(p);
 	put_task_struct_rcu_user(p);
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 3a10a7b6fcf8..047cdbcef5cf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -343,7 +343,7 @@ void attach_pid(struct task_struct *task, enum pid_type type)
 	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
 }
 
-static void __change_pid(struct task_struct *task, enum pid_type type,
+static struct pid *__change_pid(struct task_struct *task, enum pid_type type,
 			struct pid *new)
 {
 	struct pid **pid_ptr = task_pid_ptr(task, type);
@@ -362,20 +362,33 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
 
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
 		if (pid_has_task(pid, tmp))
-			return;
+			return NULL;
 
-	free_pid(pid);
+	return pid;
+}
+
+struct pid *detach_pid_return(struct task_struct *task, enum pid_type type)
+{
+	return __change_pid(task, type, NULL);
 }
 
 void detach_pid(struct task_struct *task, enum pid_type type)
 {
-	__change_pid(task, type, NULL);
+	struct pid *pid;
+
+	pid = detach_pid_return(task, type);
+	if (pid)
+		free_pid(pid);
 }
 
 void change_pid(struct task_struct *task, enum pid_type type,
 		struct pid *pid)
 {
-	__change_pid(task, type, pid);
+	struct pid *opid;
+
+	opid = __change_pid(task, type, pid);
+	if (opid)
+		free_pid(opid);
 	attach_pid(task, type);
 }
-- 
2.43.0
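The shape of the change in miniature: a hypothetical userspace sketch
(names made up; a pthread rwlock stands in for tasklist_lock and plain
malloc/free for the pid allocator), collecting the freeable pointers
while write-locked and only freeing them once the lock is dropped:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

#define NTYPES 4

static pthread_rwlock_t biglock = PTHREAD_RWLOCK_INITIALIZER; /* tasklist_lock stand-in */

/* Unlink under the lock and report what became freeable (cheap). */
static void *detach(void **slot)
{
	void *p = *slot;

	*slot = NULL;
	return p;
}

static void release(void **slots)
{
	void *post[NTYPES];	/* collected deferred work, like release_task_post */
	int i;

	memset(post, 0, sizeof(post));

	pthread_rwlock_wrlock(&biglock);
	for (i = 0; i < NTYPES; i++)
		post[i] = detach(&slots[i]);	/* only pointer juggling under the lock */
	pthread_rwlock_unlock(&biglock);

	/* The freeing, which takes its own locks, no longer extends the
	 * write-side hold time. */
	for (i = 0; i < NTYPES; i++)
		free(post[i]);
}

int main(void)
{
	void *slots[NTYPES];
	int i;

	for (i = 0; i < NTYPES; i++)
		slots[i] = malloc(16);
	release(slots);
	return 0;
}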
Oleg asked that I take a look, and I took one.

I very much agree with Oleg that this should be one patch per thing you
want to effect, as the issues can be intricate in this part of the code.

Moving proc_flush_pid inside of tasklist_lock is a bad idea. The code
has previously been several kinds of a sore spot. If you look at
proc_invalidate_siblings_dcache you can see calls to d_invalidate,
deactivate_super, and a few other vfs calls that could potentially do
quite a lot of work and potentially take a number of locks. It has been
a long time, but I remember when we used to flush the proc entries
under the tasklist_lock that there were actual deadlocks caused by some
rare code paths that were trying to free memory to allocate memory to
make progress.

It is wrong that attach_pid/detach_pid can be performed without the
tasklist_lock. There are reasonable guarantees provided by the posix
standard that the set of processes sent a signal is the set of
processes at a point in time. The tasklist_lock is how we provide
those guarantees currently.

There are two more layers to pids. The pid number allocation of
alloc_pid/free_pid, and the struct pid layer maintained by get_pid,
put_pid. Those two layers don't need the tasklist_lock.

It is safe to move free_pid out of tasklist_lock. I am not certain
how sane it is.

Eric
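The layering Eric describes, as a toy userspace model (hypothetical
names, not the kernel's signatures): the middle, struct-pid-like layer
is pure reference counting with no big lock, which is what leaves room
to move the final free around; only the attach/detach membership layer
needs tasklist_lock-style exclusion to give signal delivery a
consistent point-in-time set of tasks.

#include <stdatomic.h>
#include <stdlib.h>

/* A refcounted handle: lifetime management needs no global lock. */
struct handle {
	atomic_int refs;
	int nr;		/* the allocated number (the alloc/free layer) */
};

static struct handle *handle_alloc(int nr)
{
	struct handle *h = malloc(sizeof(*h));

	if (h) {
		atomic_init(&h->refs, 1);
		h->nr = nr;
	}
	return h;
}

/* Analogous to get_pid(): bump the count, no lock taken. */
static struct handle *handle_get(struct handle *h)
{
	atomic_fetch_add_explicit(&h->refs, 1, memory_order_relaxed);
	return h;
}

/* Analogous to put_pid(): drop the count, free on the last reference. */
static void handle_put(struct handle *h)
{
	if (atomic_fetch_sub_explicit(&h->refs, 1, memory_order_acq_rel) == 1)
		free(h);
}

int main(void)
{
	struct handle *h = handle_alloc(42);

	handle_get(h);
	handle_put(h);
	handle_put(h);	/* last put frees; no big lock involved */
	return 0;
}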
On Fri, Jan 31, 2025 at 9:56 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
> Moving proc_flush_pid inside of tasklist_lock is a bad idea.

The patch does not make such a change though.

The call is still performed without the lock, but it also dodges the
additional refcount dance (and notably eliminates an atomic from an
area protected by tasklist_lock).

> It is wrong that attach_pid/detach_pid can be performed without the
> tasklist_lock. There are reasonable guarantees provided by the posix
> standard that the set of processes sent a signal is the set of
> processes at a point in time. The tasklist_lock is how we provide
> those guarantees currently.

I don't see anything calling these without the lock, and neither my
patch nor a follow-up about pids suggests anything of the sort.

> There are two more layers to pids. The pid number allocation of
> alloc_pid/free_pid, and the struct pid layer maintained by get_pid,
> put_pid. Those two layers don't need the tasklist_lock.
>
> It is safe to move free_pid out of tasklist_lock. I am not certain
> how sane it is.

Where is the sanity problem here? AFAICS this just delays some wakeups
in the worst case.

Regardless, looks like I have enough to send a v2 for further
commentary.

-- 
Mateusz Guzik <mjguzik gmail.com>
Mateusz Guzik <mjguzik@gmail.com> writes:

> On Fri, Jan 31, 2025 at 9:56 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>> Moving proc_flush_pid inside of tasklist_lock is a bad idea.
>
> The patch does not make such a change though.
>
> The call is still performed without the lock, but it also dodges the
> additional refcount dance (and notably eliminates an atomic from an
> area protected by tasklist_lock).

My mistake. I saw you had moved it up, but I had missed just how far.

It is still a bad idea to move it early, as that has caused problems
with lingering proc entries for reaped tasks clogging up the dcache.

>> It is wrong that attach_pid/detach_pid can be performed without the
>> tasklist_lock. There are reasonable guarantees provided by the posix
>> standard that the set of processes sent a signal is the set of
>> processes at a point in time. The tasklist_lock is how we provide
>> those guarantees currently.
>
> I don't see anything calling these without the lock, and neither my
> patch nor a follow-up about pids suggests anything of the sort.

No. You simply said fork called attach_pid without the lock, and your
description implied it was safe to call attach_pid/detach_pid without
the lock.

>> There are two more layers to pids. The pid number allocation of
>> alloc_pid/free_pid, and the struct pid layer maintained by get_pid,
>> put_pid. Those two layers don't need the tasklist_lock.
>>
>> It is safe to move free_pid out of tasklist_lock. I am not certain
>> how sane it is.
>
> Where is the sanity problem here? AFAICS this just delays some wakeups
> in the worst case.

At the end of the day it might be a sensible way to go. I just haven't
found the arguments to convince my gut of that yet. It is important for
me at least to convince my gut, because it usually notices problems
before the rest of me does.

Eric
On Sat, Feb 1, 2025 at 12:09 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Mateusz Guzik <mjguzik@gmail.com> writes:
>
> > On Fri, Jan 31, 2025 at 9:56 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >> Moving proc_flush_pid inside of tasklist_lock is a bad idea.
> >
> > The patch does not make such a change though.
> >
> > The call is still performed without the lock, but it also dodges the
> > additional refcount dance (and notably eliminates an atomic from an
> > area protected by tasklist_lock).
>
> My mistake. I saw you had moved it up, but I had missed just how
> far.
>
> It is still a bad idea to move it early, as that has caused problems
> with lingering proc entries for reaped tasks clogging up the dcache.
>

I would argue the time window to find the about-to-be-whacked task is
not big, but this part is not important enough for me to push for it.
So I'm going to drop this bit for now.

> >> It is wrong that attach_pid/detach_pid can be performed without the
> >> tasklist_lock. There are reasonable guarantees provided by the posix
> >> standard that the set of processes sent a signal is the set of
> >> processes at a point in time. The tasklist_lock is how we provide
> >> those guarantees currently.
> >
> > I don't see anything calling these without the lock, and neither my
> > patch nor a follow-up about pids suggests anything of the sort.
>
> No. You simply said fork called attach_pid without the lock, and
> your description implied it was safe to call attach_pid/detach_pid
> without the lock.
>

Huh, indeed that's how it reads. That's very poorly stated at best, my
bad.

The key was that *allocating* a pid happens without the tasklist_lock
(but with pidmap_lock), and the part which gets rid of it (detach_pid
-> free_pid) operates under both. As you can see, the patch keeps
detach_pid inside the tasklist_lock-protected area.

> >> It is safe to move free_pid out of tasklist_lock. I am not certain
> >> how sane it is.
> >
> > Where is the sanity problem here? AFAICS this just delays some wakeups
> > in the worst case.
>
> At the end of the day it might be a sensible way to go. I just haven't
> found the arguments to convince my gut of that yet. It is important for
> me at least to convince my gut, because it usually notices problems
> before the rest of me does.
>

There is definitely no rush. I'm going to cook v3 if only just for fun.

-- 
Mateusz Guzik <mjguzik gmail.com>
(Add Eric).
On 01/28, Mateusz Guzik wrote:
>
> Both add_device_randomness() and attach_pid()/detach_pid()
So afaics this patch does 2 different things, and I do think this needs
2 separate patches. Can you split this change please?
As for add_device_randomness(). I must have missed something, but I still can't
understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
to release_task() and avoid release_task_post->randomness.
You said:
I wanted to keep the load where it was
but why??? Again, I must have missed something, but to me this simply adds
unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
task-to-release != current, so what is the point?
Oleg.
> have their
> own synchronisation mechanisms.
>
> The clone side calls them *without* the tasklist_lock held, meaning
> parallel calls can contend on their locks.
>
> The exit side calls them *with* the tasklist_lock held, which means the
> hold time is avoidably extended by waiting on either of the two locks,
> in turn exacerbating contention on tasklist_lock itself.
>
> Postponing the work until after the lock is dropped bumps the thread
> creation/destruction rate by 15% on a 24-core VM.
>
> Bench (plop into will-it-scale):
> $ cat tests/threadspawn1.c
>
> #include <pthread.h>
> #include <assert.h>
> #include <stddef.h>
>
> char *testcase_description = "Thread creation and teardown";
>
> static void *worker(void *arg)
> {
> 	return (NULL);
> }
>
> void testcase(unsigned long long *iterations, unsigned long nr)
> {
> 	pthread_t thread;
> 	int error;
>
> 	while (1) {
> 		error = pthread_create(&thread, NULL, worker, NULL);
> 		assert(error == 0);
> 		error = pthread_join(thread, NULL);
> 		assert(error == 0);
> 		(*iterations)++;
> 	}
> }
>
> Run:
> $ ./threadspawn1_processes -t 24
>
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
>
> v2:
> - introduce a struct to collect work
> - move out pids as well
>
> There is more which can be pulled out.
>
> This may look suspicious:
> +	proc_flush_pid(p->thread_pid);
>
> AFAICS this is constant for the duration of the task's lifetime, so I
> don't think there is a problem.
>
>  include/linux/pid.h |  1 +
>  kernel/exit.c       | 53 +++++++++++++++++++++++++++++++++------------
>  kernel/pid.c        | 23 +++++++++++++++-----
>  3 files changed, 58 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 98837a1ff0f3..6e9fcacd02cd 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -101,6 +101,7 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
>   * these helpers must be called with the tasklist_lock write-held.
>   */
>  extern void attach_pid(struct task_struct *task, enum pid_type);
> +extern struct pid *detach_pid_return(struct task_struct *task, enum pid_type);
>  extern void detach_pid(struct task_struct *task, enum pid_type);
>  extern void change_pid(struct task_struct *task, enum pid_type,
>  			struct pid *pid);
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 1dcddfe537ee..4e452d3e3a89 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -122,14 +122,23 @@ static __init int kernel_exit_sysfs_init(void)
>  late_initcall(kernel_exit_sysfs_init);
>  #endif
>
> -static void __unhash_process(struct task_struct *p, bool group_dead)
> +/*
> + * For things release_task would like to do *after* tasklist_lock is released.
> + */
> +struct release_task_post {
> +	unsigned long long randomness;
> +	struct pid *pids[PIDTYPE_MAX];
> +};
> +
> +static void __unhash_process(struct release_task_post *post, struct task_struct *p,
> +			     bool group_dead)
>  {
>  	nr_threads--;
> -	detach_pid(p, PIDTYPE_PID);
> +	post->pids[PIDTYPE_PID] = detach_pid_return(p, PIDTYPE_PID);
>  	if (group_dead) {
> -		detach_pid(p, PIDTYPE_TGID);
> -		detach_pid(p, PIDTYPE_PGID);
> -		detach_pid(p, PIDTYPE_SID);
> +		post->pids[PIDTYPE_TGID] = detach_pid_return(p, PIDTYPE_TGID);
> +		post->pids[PIDTYPE_PGID] = detach_pid_return(p, PIDTYPE_PGID);
> +		post->pids[PIDTYPE_SID] = detach_pid_return(p, PIDTYPE_SID);
>
>  		list_del_rcu(&p->tasks);
>  		list_del_init(&p->sibling);
> @@ -141,7 +150,8 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
>  /*
>   * This function expects the tasklist_lock write-locked.
>   */
> -static void __exit_signal(struct task_struct *tsk)
> +static void __exit_signal(struct release_task_post *post,
> +			  struct task_struct *tsk)
>  {
>  	struct signal_struct *sig = tsk->signal;
>  	bool group_dead = thread_group_leader(tsk);
> @@ -174,8 +184,7 @@ static void __exit_signal(struct task_struct *tsk)
>  			sig->curr_target = next_thread(tsk);
>  	}
>
> -	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
> -			      sizeof(unsigned long long));
> +	post->randomness = tsk->se.sum_exec_runtime;
>
>  	/*
>  	 * Accumulate here the counters for all threads as they die. We could
> @@ -197,7 +206,7 @@ static void __exit_signal(struct task_struct *tsk)
>  	task_io_accounting_add(&sig->ioac, &tsk->ioac);
>  	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
>  	sig->nr_threads--;
> -	__unhash_process(tsk, group_dead);
> +	__unhash_process(post, tsk, group_dead);
>  	write_sequnlock(&sig->stats_lock);
>
>  	/*
> @@ -240,9 +249,13 @@ void __weak release_thread(struct task_struct *dead_task)
>  void release_task(struct task_struct *p)
>  {
>  	struct task_struct *leader;
> -	struct pid *thread_pid;
>  	int zap_leader;
> +	struct release_task_post post;
>  repeat:
> +	memset(&post, 0, sizeof(post));
> +
> +	proc_flush_pid(p->thread_pid);
> +
>  	/* don't need to get the RCU readlock here - the process is dead and
>  	 * can't be modifying its own credentials. But shut RCU-lockdep up */
>  	rcu_read_lock();
> @@ -253,8 +266,7 @@ void release_task(struct task_struct *p)
>
>  	write_lock_irq(&tasklist_lock);
>  	ptrace_release_task(p);
> -	thread_pid = get_pid(p->thread_pid);
> -	__exit_signal(p);
> +	__exit_signal(&post, p);
>
>  	/*
>  	 * If we are the last non-leader member of the thread
> @@ -276,8 +288,21 @@ void release_task(struct task_struct *p)
>  	}
>
>  	write_unlock_irq(&tasklist_lock);
> -	proc_flush_pid(thread_pid);
> -	put_pid(thread_pid);
> +
> +	/*
> +	 * Process clean up deferred to after we drop the tasklist lock.
> +	 */
> +	add_device_randomness((const void*) &post.randomness,
> +			      sizeof(unsigned long long));
> +	if (post.pids[PIDTYPE_PID])
> +		free_pid(post.pids[PIDTYPE_PID]);
> +	if (post.pids[PIDTYPE_TGID])
> +		free_pid(post.pids[PIDTYPE_TGID]);
> +	if (post.pids[PIDTYPE_PGID])
> +		free_pid(post.pids[PIDTYPE_PGID]);
> +	if (post.pids[PIDTYPE_SID])
> +		free_pid(post.pids[PIDTYPE_SID]);
> +
>  	release_thread(p);
>  	put_task_struct_rcu_user(p);
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 3a10a7b6fcf8..047cdbcef5cf 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -343,7 +343,7 @@ void attach_pid(struct task_struct *task, enum pid_type type)
>  	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
>  }
>
> -static void __change_pid(struct task_struct *task, enum pid_type type,
> +static struct pid *__change_pid(struct task_struct *task, enum pid_type type,
>  			struct pid *new)
>  {
>  	struct pid **pid_ptr = task_pid_ptr(task, type);
> @@ -362,20 +362,33 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
>
>  	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
>  		if (pid_has_task(pid, tmp))
> -			return;
> +			return NULL;
>
> -	free_pid(pid);
> +	return pid;
> +}
> +
> +struct pid *detach_pid_return(struct task_struct *task, enum pid_type type)
> +{
> +	return __change_pid(task, type, NULL);
>  }
>
>  void detach_pid(struct task_struct *task, enum pid_type type)
>  {
> -	__change_pid(task, type, NULL);
> +	struct pid *pid;
> +
> +	pid = detach_pid_return(task, type);
> +	if (pid)
> +		free_pid(pid);
>  }
>
>  void change_pid(struct task_struct *task, enum pid_type type,
>  		struct pid *pid)
>  {
> -	__change_pid(task, type, pid);
> +	struct pid *opid;
> +
> +	opid = __change_pid(task, type, pid);
> +	if (opid)
> +		free_pid(opid);
>  	attach_pid(task, type);
>  }
>
> --
> 2.43.0
>
On Tue, Jan 28, 2025 at 7:30 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> (Add Eric).
>
> On 01/28, Mateusz Guzik wrote:
> >
> > Both add_device_randomness() and attach_pid()/detach_pid()
>
> So afaics this patch does 2 different things, and I do think this needs
> 2 separate patches. Can you split this change please?
>

No problem, I will send a v3 provided there are no issues reported
concerning the pid stuff.

Maybe I'll add a few more things pulled out to further justify the
struct.

> As for add_device_randomness(). I must have missed something, but I still can't
> understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
> to release_task() and avoid release_task_post->randomness.
>
> You said:
>
> I wanted to keep the load where it was
>
> but why??? Again, I must have missed something, but to me this simply adds
> unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
> task-to-release != current, so what is the point?
>

Perhaps I should preface this is not a hill I'm going to die on. :->

This is the spot which is known to work, and release_task does not
access the area otherwise. So for all I know someone will change it
later to be freeable, zeroed for "hardening" or some other crap, and
the read moved to later will quietly break to always add the same
value. So by default I don't want to change that aspect.

However, if you insist on just moving the read, I'll be more than
happy to do that in v3.

-- 
Mateusz Guzik <mjguzik gmail.com>
On 01/28, Mateusz Guzik wrote:
>
> On Tue, Jan 28, 2025 at 7:30 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> No problem, I will send a v3 provided there are no issues reported
> concerning the pid stuff.

Great, thanks.

BTW, I didn't look at the pid stuff yet. I _feel_ that this can be
simplified too, but I am already sleeping, most probably I am wrong.

> > As for add_device_randomness(). I must have missed something, but I still can't
> > understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
> > to release_task() and avoid release_task_post->randomness.
> >
> > You said:
> >
> > I wanted to keep the load where it was
> >
> > but why??? Again, I must have missed something, but to me this simply adds
> > unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
> > task-to-release != current, so what is the point?
>
> Perhaps I should preface this is not a hill I'm going to die on. :->
>
> This is the spot which is known to work, and release_task does not
> access the area otherwise. So for all I know someone will change it
> later to be freeable, zeroed for "hardening"

sum_exec_runtime can't be freed/zeroed/etc in any case. Again, please
note that task-to-release can still be running, especially if it is
current.

> always add the same value.

I don't think that "add the same value" matters at all in this case,
but please correct me.

> So by default I don't want to change that aspect.
>
> However, if you insist on just moving the read, I'll be more than
> happy to do that in v3.

Thanks, see above ;)

Oleg.
On Tue, Jan 28, 2025 at 8:22 PM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 01/28, Mateusz Guzik wrote:
> >
> > On Tue, Jan 28, 2025 at 7:30 PM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > No problem, I will send a v3 provided there are no issues reported
> > concerning the pid stuff.
>
> Great, thanks.
>
> BTW, I didn't look at the pid stuff yet. I _feel_ that this can be
> simplified too, but I am already sleeping, most probably I am wrong.
>

I looked at the pid code apart from the issue at hand. The lock
protecting it uses irq disablement to guard against tasklist_lock
users coming from an interrupt.

AFAICS this can be legally arranged so that the pidmap_lock is *never*
taken while tasklist_lock is held. So far the problematic ordering
only stems from free_pid calls (not only on exit), which can all be
moved out.

This will reduce total tasklist_lock hold time *and* whack the irq
trip, speeding this up single-threaded.

I'll hack it up when I get around to it, maybe this week.

BTW, with the current patch, when rolling with highly parallel thread
creation/destruction it is pidmap_lock which is the main bottleneck
instead of tasklist_lock.

-- 
Mateusz Guzik <mjguzik gmail.com>
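The ordering argument as a toy model (hypothetical userspace code;
pthread locks stand in for tasklist_lock and pidmap_lock): once every
free call happens after the write unlock, the allocator lock is never
acquired while the big lock is held, so the nesting that forces the
irq-disabled acquisition in the kernel disappears.

#include <pthread.h>

static pthread_rwlock_t outer = PTHREAD_RWLOCK_INITIALIZER;	/* tasklist_lock stand-in */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;	/* pidmap_lock stand-in */

static void free_id(int id)
{
	(void)id;		/* placeholder for returning id to the allocator */
	pthread_mutex_lock(&inner);
	pthread_mutex_unlock(&inner);
}

/* Today: the allocator lock nests inside the write lock, so it must cope
 * with every context the outer lock can be taken from (irqs included, in
 * the kernel case). */
static void release_nested(int id)
{
	pthread_rwlock_wrlock(&outer);
	/* detach ... */
	free_id(id);
	pthread_rwlock_unlock(&outer);
}

/* Proposed: the allocator lock is only ever taken with the write lock
 * dropped; the nesting, and the irq dance it forces, is gone. */
static void release_flat(int id)
{
	pthread_rwlock_wrlock(&outer);
	/* detach, stash id ... */
	pthread_rwlock_unlock(&outer);
	free_id(id);
}

int main(void)
{
	release_nested(1);
	release_flat(2);
	return 0;
}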