include/linux/sched.h | 7 +++++++ kernel/sched/core.c | 1 + kernel/sched/rt.c | 26 +++++++++++++++++++++++--- kernel/signal.c | 9 +++++++++ 4 files changed, 40 insertions(+), 3 deletions(-)
Hello,
I'm working on syzbot bug: rcu detected stall in validate_mm
https://syzkaller.appspot.com/bug?extid=a941018a091f1a1f9546.
I have analyzed this issue and here is what I found:
When too many signals are sent to the RT task, the in-kernel signal-delivery
overhead becomes very high. The task cannot perform its job and as a consequence
the rt_runtime (0.95s) is not reached even after hundreds of seconds.
This situation can be achieved by using POSIX Timers with very low interval.
The problem scenario:
1. RTLIMIT_RTPRIO is changed to non-zero value.
2. Scheduler policy is changed to SCHED_FIFO or SCHED_RR.
3. Posix Timer is created with low interval - several nanoseconds.
4. The signaling overhead becomes very high, the RT task is scheduled
but cannot reach rt_runtime (0.95s). As a consequence there is no context
switch with non-RT task even after hundreds of seconds.
I have created a very simple solution by monitoring the number of signals
and throttling the RT task when the number of signals is greater than
or equal to 100. I am aware that this solution is very weak and cannot
be applied as a long-term, proper solution.
Could you help with finding more appropriate solution?
The patch is here:
Reported-by: syzbot+a941018a091f1a1f9546@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=a941018a091f1a1f9546
Link: https://lore.kernel.org/all/0000000000000a13ee06183e4464@google.com/T/
Signed-off-by: Radoslaw Zielonek <radoslaw.zielonek@gmail.com>
---
include/linux/sched.h | 7 +++++++
kernel/sched/core.c | 1 +
kernel/sched/rt.c | 26 +++++++++++++++++++++++---
kernel/signal.c | 9 +++++++++
4 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 17cb0761ff65..123bc16ad3d0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1121,6 +1121,13 @@ struct task_struct {
size_t sas_ss_size;
unsigned int sas_ss_flags;
+ /*
+ * Number of signals received by an RT task between scheduling ticks.
+ * This counter is used to throttle RT tasks when too many signals
+ * (e.g., POSIX timers) are sent to the task, which can cause an RCU stall.
+ */
+ atomic_t rt_signals_recv_count; /* used outside of the rq lock */
+
struct callback_head *task_works;
#ifdef CONFIG_AUDIT
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d44efa0d0611..9def826bd35f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4779,6 +4779,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->policy = SCHED_NORMAL;
p->static_prio = NICE_TO_PRIO(0);
p->rt_priority = 0;
+ atomic_set(&p->rt_signals_recv_count, 0);
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3261b067b67e..9b22d67d1746 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -24,6 +24,15 @@ int sysctl_sched_rt_period = 1000000;
*/
int sysctl_sched_rt_runtime = 950000;
+/*
+ * To avoid an RCU stall due to a large number of signals received by RT tasks
+ * (e.g., POSIX timers), the RT task needs to be throttled.
+ * When the number of signals received by an RT task during a scheduling
+ * tick period exceeds the threshold, the RT task will be throttled.
+ * The value of 100 has not been thoroughly tested and may need adjustment.
+ */
+#define RT_RECV_SGINAL_THROTTLE_THRESHOLD 100
+
#ifdef CONFIG_SYSCTL
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
@@ -951,7 +960,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio;
}
-static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq, int rt_signal_recv)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -966,7 +975,15 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
if (runtime == RUNTIME_INF)
return 0;
- if (rt_rq->rt_time > runtime) {
+ /*
+ * When a large number of signals are sent to this task (e.g., POSIX timers)
+ * the delta time deviates significantly from real time due to the overhead
+ * of handling signals. For RT tasks, this can cause an RCU stall.
+ * To avoid this, throttle the task when the number of signals received
+ * exceeds a certain threshold.
+ */
+ if (rt_rq->rt_time > runtime ||
+ rt_signal_recv >= RT_RECV_SGINAL_THROTTLE_THRESHOLD) {
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
/*
@@ -1021,7 +1038,9 @@ static void update_curr_rt(struct rq *rq)
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
- exceeded = sched_rt_runtime_exceeded(rt_rq);
+ exceeded = sched_rt_runtime_exceeded(
+ rt_rq,
+ atomic_read(&curr->rt_signals_recv_count));
if (exceeded)
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
@@ -1029,6 +1048,7 @@ static void update_curr_rt(struct rq *rq)
do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
+ atomic_set(&curr->rt_signals_recv_count, 0);
}
static void
diff --git a/kernel/signal.c b/kernel/signal.c
index bdca529f0f7b..d58e0ba9336c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -629,6 +629,15 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask,
bool resched_timer = false;
int signr;
+ /*
+ * To prevent an RCU stall due to receiving too many signals by RT tasks,
+ * count all signals regardless of their type.
+ * Based on this counter, the RT scheduler will decide whether the task
+ * should be throttled or not.
+ */
+ if (tsk->policy == SCHED_FIFO || tsk->policy == SCHED_RR)
+ atomic_inc(&tsk->rt_signals_recv_count);
+
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
--
2.43.0
On Fri, Jul 05, 2024 at 09:56:23AM +0200, Radoslaw Zielonek wrote:
> Hello,
>
> I'm working on syzbot bug: rcu detected stall in validate_mm
> https://syzkaller.appspot.com/bug?extid=a941018a091f1a1f9546.
> I have analyzed this issue and here is what I found:
>
> When too many signals are sent to the RT task, the overhead becomes very high.
> The task cannot perform its job and as a consquenece the rt_runtime (0.95s)
> is not reached even after hundreds of seconds.

I'm having trouble parsing this. What overhead becomes high? Is the task
spending time in-kernel? Because if the task is spending time in-user
handling all its signals, it should accumulate runtime just fine.

That is, your analysis seems to leave out / gloss over the important bit.
> I'm having trouble parsing this. What overhead becomes high? Is the task
> spending time in-kernel? Because if the task is spending time in-user
> handling all its signals, it should accumulate runtime just fine.

The overhead is in kernel. The RT task is preempted over and over by
SIGRETURN. In my case userspace set the posix timer interval to 8ns.
The posix_timer_fn enqueues a signal (send_sigqueue). Then when the signal
is dequeued (dequeue_signal) the posix timer is rearmed.

Radoslaw
© 2016 - 2026 Red Hat, Inc.