Processes with many threads run the risk of causing a hard lockup if
too many threads call getrusage() at once. This is because a thread
calling it with RUSAGE_SELF spins on the sighand lock with IRQs
disabled, and the critical section of getrusage() scales linearly with
the size of the process. All CPUs may end up spinning on the sighand
lock for a long time because another thread holds the lock and is busy
iterating over 250k+ threads.
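For a rough sense of scale (illustrative numbers only, not measurements
from a specific report): if each holder of the sighand lock walks ~250k
task_structs and ~100 CPUs are queued behind it with IRQs disabled, the
last waiter spins through on the order of 100 * 250k = 25 million thread
iterations before making progress, which is more than enough to trip the
NMI watchdog.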
In order to mitigate this, periodically re-enable interrupts while
waiting for the sighand lock. This approach lacks the FIFO fairness of a
normal spinlock, but the unfairness is somewhat contained to threads
within the same process.
-- Alternatives Considered --
In an earlier version of the above approach, we added a cond_resched()
call when disabling interrupts to prevent soft lockups. This solution
turned out not to be workable on its own since getrusage() is called
from a non-preemptible context in kernel/exit.c, but could possibly be
adapted by having an alternate version of getrusage() that can be called
from a preemptible context.
Another alternative would be to have getrusage() itself release the lock
and enable interrupts periodically while iterating over large numbers of
threads. However, this would be difficult to implement correctly, and
the correctness/consistency of the data reported by getrusage() would be
questionable.
One final alternative might be to add a per-process mutex for callers of
getrusage() to acquire before acquiring the sighand lock, or to be used
as a total replacement of the sighand lock. We haven't fully explored
what the implications of this might be.
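For illustration only, a minimal sketch of what the mutex variant might
look like (rusage_mutex is a hypothetical new signal_struct field, not an
existing kernel API, and this ignores the non-preemptible getrusage()
caller in kernel/exit.c mentioned above):

#include <linux/mutex.h>
#include <linux/resource.h>
#include <linux/sched/signal.h>

/*
 * Hypothetical sketch: serialize getrusage() callers within one process
 * so that at most one thread per process spins on the sighand lock with
 * IRQs disabled. Assumes a new 'struct mutex rusage_mutex' member in
 * struct signal_struct.
 */
static void getrusage_serialized(struct task_struct *p, int who,
				 struct rusage *r)
{
	struct signal_struct *sig = p->signal;
	unsigned long flags;

	mutex_lock(&sig->rusage_mutex);
	if (lock_task_sighand(p, &flags)) {
		/* ... the existing getrusage() accounting under siglock ... */
		unlock_task_sighand(p, &flags);
	}
	mutex_unlock(&sig->rusage_mutex);
}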
Signed-off-by: Dylan Hatch <dylanbhatch@google.com>
---
include/linux/sched/signal.h | 13 +++++++++++
kernel/signal.c | 43 ++++++++++++++++++++++++++++++++++++
kernel/sys.c | 8 ++++++-
3 files changed, 63 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3499c1a8b9295..7852f16139965 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -747,6 +747,19 @@ static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
return ret;
}
+extern struct sighand_struct *__lock_task_sighand_safe(struct task_struct *task,
+ unsigned long *flags);
+
+static inline struct sighand_struct *lock_task_sighand_safe(struct task_struct *task,
+ unsigned long *flags)
+{
+ struct sighand_struct *ret;
+
+ ret = __lock_task_sighand_safe(task, flags);
+ (void)__cond_lock(&task->sighand->siglock, ret);
+ return ret;
+}
+
static inline void unlock_task_sighand(struct task_struct *task,
unsigned long *flags)
{
diff --git a/kernel/signal.c b/kernel/signal.c
index 47a7602dfe8df..6d60c73b7ab91 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1397,6 +1397,49 @@ int zap_other_threads(struct task_struct *p)
return count;
}
+struct sighand_struct *__lock_task_sighand_safe(struct task_struct *tsk,
+ unsigned long *flags)
+{
+ struct sighand_struct *sighand;
+ int n;
+ bool lock = false;
+
+again:
+ rcu_read_lock();
+ local_irq_save(*flags);
+ for (n = 0; n < 500; n++) {
+ sighand = rcu_dereference(tsk->sighand);
+ if (unlikely(sighand == NULL))
+ break;
+
+ /*
+ * The downside of this approach is we lose the fairness of
+ * FIFO waiting because the acquisition order between multiple
+ * waiting tasks is effectively random.
+ */
+ lock = spin_trylock(&sighand->siglock);
+ if (!lock) {
+ cpu_relax();
+ continue;
+ }
+
+ /* __lock_task_sighand has context explaining this check. */
+ if (likely(sighand == rcu_access_pointer(tsk->sighand)))
+ break;
+ spin_unlock(&sighand->siglock);
+ lock = false;
+ }
+ rcu_read_unlock();
+
+ /* Handle pending IRQ */
+ if (!lock && sighand) {
+ local_irq_restore(*flags);
+ goto again;
+ }
+
+ return sighand;
+}
+
struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
unsigned long *flags)
{
diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d8..1b1254a3c506b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1798,7 +1798,13 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
goto out;
}
- if (!lock_task_sighand(p, &flags))
+ /*
+ * We use lock_task_sighand_safe here instead of lock_task_sighand
+ * because one process with many threads all calling getrusage may
+ * otherwise cause an NMI watchdog timeout by disabling IRQs for too
+ * long.
+ */
+ if (!lock_task_sighand_safe(p, &flags))
return;
switch (who) {
--
2.43.0.381.gb435a96ce8-goog
Heh ;)
getrusage() should not use ->siglock at all.
On my TODO list. I'll try to make a patch this week.
Oleg.
On 01/17, Oleg Nesterov wrote:
>
> Heh ;)
>
> getrusage() should not use ->siglock at all.
>
> On my TODO list. I'll try to make a patch this week.
something like below...
I need to re-check this change, split it into 2 patches, and write
the changelogs. Hopefully tomorrow.
Oleg.
---
diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d..0728744a90a6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1785,21 +1785,24 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
struct task_struct *t;
unsigned long flags;
u64 tgutime, tgstime, utime, stime;
- unsigned long maxrss = 0;
+ unsigned long maxrss;
+ struct mm_struct *mm;
struct signal_struct *sig = p->signal;
+ unsigned int seq = 0;
+retry:
memset((char *)r, 0, sizeof (*r));
utime = stime = 0;
+ maxrss = 0;
if (who == RUSAGE_THREAD) {
task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
maxrss = sig->maxrss;
- goto out;
+ goto out_thread;
}
- if (!lock_task_sighand(p, &flags))
- return;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
switch (who) {
case RUSAGE_BOTH:
@@ -1819,9 +1822,6 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
fallthrough;
case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
r->ru_nvcsw += sig->nvcsw;
r->ru_nivcsw += sig->nivcsw;
r->ru_minflt += sig->min_flt;
@@ -1830,28 +1830,42 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
r->ru_oublock += sig->oublock;
if (maxrss < sig->maxrss)
maxrss = sig->maxrss;
+
+ rcu_read_lock();
__for_each_thread(sig, t)
accumulate_thread_rusage(t, r);
+ rcu_read_unlock();
+
break;
default:
BUG();
}
- unlock_task_sighand(p, &flags);
-out:
- r->ru_utime = ns_to_kernel_old_timeval(utime);
- r->ru_stime = ns_to_kernel_old_timeval(stime);
+ if (need_seqretry(&sig->stats_lock, seq)) {
+ seq = 1;
+ goto retry;
+ }
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- if (who != RUSAGE_CHILDREN) {
- struct mm_struct *mm = get_task_mm(p);
+ if (who == RUSAGE_CHILDREN)
+ goto done;
- if (mm) {
- setmax_mm_hiwater_rss(&maxrss, mm);
- mmput(mm);
- }
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+
+out_thread:
+ mm = get_task_mm(p);
+ if (mm) {
+ setmax_mm_hiwater_rss(&maxrss, mm);
+ mmput(mm);
}
+
+done:
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
+ r->ru_utime = ns_to_kernel_old_timeval(utime);
+ r->ru_stime = ns_to_kernel_old_timeval(stime);
}
SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
thread_group_cputime() does its own locking, so we can safely shift
thread_group_cputime_adjusted(), which does another for_each_thread loop,
outside of the ->siglock-protected section.

This is also preparation for the next patch, which changes getrusage() to
use stats_lock instead of siglock. Currently the deadlock is not possible:
if getrusage() enters the slow path and takes stats_lock, read_seqretry()
in thread_group_cputime() must always return 0, so thread_group_cputime()
will never try to take the same lock. Yet this looks safer and better
performance-wise.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/sys.c | 34 +++++++++++++++++++---------------
1 file changed, 19 insertions(+), 15 deletions(-)
diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d..70ad06ad852e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1785,17 +1785,19 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
struct task_struct *t;
unsigned long flags;
u64 tgutime, tgstime, utime, stime;
- unsigned long maxrss = 0;
+ unsigned long maxrss;
+ struct mm_struct *mm;
struct signal_struct *sig = p->signal;
- memset((char *)r, 0, sizeof (*r));
+ memset(r, 0, sizeof(*r));
utime = stime = 0;
+ maxrss = 0;
if (who == RUSAGE_THREAD) {
task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
maxrss = sig->maxrss;
- goto out;
+ goto out_thread;
}
if (!lock_task_sighand(p, &flags))
@@ -1819,9 +1821,6 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
fallthrough;
case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
r->ru_nvcsw += sig->nvcsw;
r->ru_nivcsw += sig->nivcsw;
r->ru_minflt += sig->min_flt;
@@ -1839,19 +1838,24 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
}
unlock_task_sighand(p, &flags);
-out:
- r->ru_utime = ns_to_kernel_old_timeval(utime);
- r->ru_stime = ns_to_kernel_old_timeval(stime);
+ if (who == RUSAGE_CHILDREN)
+ goto out_children;
- if (who != RUSAGE_CHILDREN) {
- struct mm_struct *mm = get_task_mm(p);
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
- if (mm) {
- setmax_mm_hiwater_rss(&maxrss, mm);
- mmput(mm);
- }
+out_thread:
+ mm = get_task_mm(p);
+ if (mm) {
+ setmax_mm_hiwater_rss(&maxrss, mm);
+ mmput(mm);
}
+
+out_children:
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
+ r->ru_utime = ns_to_kernel_old_timeval(utime);
+ r->ru_stime = ns_to_kernel_old_timeval(stime);
}
SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
--
2.25.1.362.g51ebf55
On Fri, Jan 19, 2024 at 6:16 AM Oleg Nesterov <oleg@redhat.com> wrote:
Tested-by: Dylan Hatch <dylanbhatch@google.com>
Rather than lock_task_sighand(), use sig->stats_lock; it was specifically
designed for this type of use. This way getrusage() runs lockless in the
likely case.
TODO:
- Change do_task_stat() to use sig->stats_lock too, then we can
remove spin_lock_irq(siglock) in wait_task_zombie().
- Turn sig->stats_lock into seqcount_rwlock_t, this way the
readers in the slow mode won't exclude each other. See
https://lore.kernel.org/all/20230913154907.GA26210@redhat.com/
- stats_lock has to disable irqs because ->siglock can be taken
in irq context, it would be very nice to change __exit_signal()
to avoid the siglock->stats_lock dependency.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/sys.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/kernel/sys.c b/kernel/sys.c
index 70ad06ad852e..f8e543f1e38a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,7 +1788,9 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
unsigned long maxrss;
struct mm_struct *mm;
struct signal_struct *sig = p->signal;
+ unsigned int seq = 0;
+retry:
memset(r, 0, sizeof(*r));
utime = stime = 0;
maxrss = 0;
@@ -1800,8 +1802,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
goto out_thread;
}
- if (!lock_task_sighand(p, &flags))
- return;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
switch (who) {
case RUSAGE_BOTH:
@@ -1829,14 +1830,23 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
r->ru_oublock += sig->oublock;
if (maxrss < sig->maxrss)
maxrss = sig->maxrss;
+
+ rcu_read_lock();
__for_each_thread(sig, t)
accumulate_thread_rusage(t, r);
+ rcu_read_unlock();
+
break;
default:
BUG();
}
- unlock_task_sighand(p, &flags);
+
+ if (need_seqretry(&sig->stats_lock, seq)) {
+ seq = 1;
+ goto retry;
+ }
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
if (who == RUSAGE_CHILDREN)
goto out_children;
--
2.25.1.362.g51ebf55
On Fri, Jan 19, 2024 at 6:16 AM Oleg Nesterov <oleg@redhat.com> wrote:
I applied these to a 5.10 kernel, and my repro (calling getrusage(RUSAGE_SELF)
from 200K threads) is no longer triggering a hard lockup.
Tested-by: Dylan Hatch <dylanbhatch@google.com>
On Fri, 19 Jan 2024 19:27:49 -0800 Dylan Hatch <dylanbhatch@google.com> wrote:

> I applied these to a 5.10 kernel, and my repro (calling getrusage(RUSAGE_SELF)
> from 200K threads) is no longer triggering a hard lockup.

Thanks, but...

The changelogs don't actually describe any hard lockup. [1/2] does
mention "the deadlock" but that's all the info we have.

So could we please have a suitable description of the bug which these are
addressing? And a Reported-by:, a Closes: and a Fixes would be great too.
On 01/20, Andrew Morton wrote:
>
> On Fri, 19 Jan 2024 19:27:49 -0800 Dylan Hatch <dylanbhatch@google.com> wrote:
>
> >
> > I applied these to a 5.10 kernel, and my repro (calling getrusage(RUSAGE_SELF)
> > from 200K threads) is no longer triggering a hard lockup.
>
> Thanks, but...
>
> The changelogs don't actually describe any hard lockup. [1/2] does
> mention "the deadlock" but that's all the info we have.
Sorry for confusion... 1/2 tries to explain that this change is not
strictly necessary for 2/2, it is safe to call thread_group_cputime()
with sig->stats_lock held for writing even if thread_group_cputime()
takes the same lock, because in this case thread_group_cputime() can't
enter the slow mode.
> So could we please have a suitable description of the bug which these are
> addressing? And a Reported-by:, a Closes: and a Fixes would be great too.
Yes sorry I forgot to add Reported-by. So I'll try to update the changelog
and add Reported-and-tested-by.
But the problem is known and old. I think do_io_accounting() had the same
problem until 1df4bd83cdfdbd0 ("do_io_accounting: use sig->stats_lock").
and do_task_stat() ...
getrusage() takes siglock and does for_each_thread() twice. If NR_THREADS
threads call sys_getrusage() in an endless loop on NR_CPUS CPUs,
lock_task_sighand() can trigger a hard lockup because it spins with IRQs
disabled waiting for the other NR_CPUS-1 CPUs, which all need the same
siglock. So the time it spins with IRQs disabled is O(NR_CPUS * NR_THREADS).

With this patch all the threads can run lockless in parallel in the
likely case.
Dylan, do you have a better description? Can you share your repro?
although I think that something simple like
#include <pthread.h>
#include <sys/resource.h>

#define NT BIG_NUMBER

pthread_barrier_t barr;

void *thread(void *arg)
{
	struct rusage ru;

	pthread_barrier_wait(&barr);
	for (;;)
		getrusage(RUSAGE_SELF, &ru);
	return NULL;
}

int main(void)
{
	pthread_barrier_init(&barr, NULL, NT);

	for (int n = 0; n < NT-1; ++n) {
		pthread_t pt;
		pthread_create(&pt, NULL, thread, NULL);
	}
	thread(NULL);

	return 0;
}
should work if you have a machine with a lot of memory/cpus.
Oleg.
On Sun, Jan 21, 2024 at 4:09 AM Oleg Nesterov <oleg@redhat.com> wrote:
>
> Dylan, do you have a better description? Can you share your repro?
That description seems accurate to me.
Here's my repro, very similar to what you've sent:
#define _GNU_SOURCE
#include <sys/resource.h>
#include <sched.h>
#include <sys/wait.h>
#include <stdio.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>
int thrd_func(void *data) {
	struct rusage usage;
	int *complete = (void *)data;

	while (!*complete);
	while (1) {
		getrusage(RUSAGE_SELF, &usage);
	}
}

#define STACK_SIZE (1024)

int main(int argc, char **argv) {
	if (argc != 2) {
		printf("Usage: %s <thread count>\n", argv[0]);
		exit(EXIT_SUCCESS);
	}

	const int cnt = atoi(argv[1]);
	int pids[cnt];
	int complete = 0;

	printf("Starting test with %d threads...\n", cnt);
	for (int i = 0; i < cnt; i++) {
		char *stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
		if (stack == MAP_FAILED) {
			perror("mmap() failed\n");
			return -1;
		}
		pids[i] = clone(thrd_func, stack + STACK_SIZE,
				CLONE_THREAD | CLONE_SIGHAND | CLONE_FS |
				CLONE_VM | CLONE_FILES, (void *) &complete);
		if (pids[i] == -1) {
			perror("clone() failed\n");
			return pids[i];
		}
	}

	complete = 1;
	printf("waiting on threads...\n");
	sleep(100);
	complete = 0;
	printf("test finished.\n");
	exit(EXIT_SUCCESS);
}
I can't remember exactly why I chose to call mmap and clone directly instead
of using pthreads... but I do know that mmap'ing a smaller stack size
makes the repro more reliable since you can create more threads. It
seemed like around 250K threads was about enough to reliably produce
the lockup, but your mileage may vary.
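(In case it helps with reproducing this: the program above takes the
thread count as its only argument and calls clone() directly, so no
-pthread is needed; assuming it is saved as repro.c, something like
"gcc -O2 repro.c -o repro && ./repro 250000" should do.)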