Add a monitor for checking that real-time tasks do not go to sleep in a
manner that may cause undesirable latency.
Also change
RV depends on TRACING
to
RV select TRACING
to avoid the following recursive dependency:
error: recursive dependency detected!
symbol TRACING is selected by PREEMPTIRQ_TRACEPOINTS
symbol PREEMPTIRQ_TRACEPOINTS depends on TRACE_IRQFLAGS
symbol TRACE_IRQFLAGS is selected by RV_MON_SLEEP
symbol RV_MON_SLEEP depends on RV
symbol RV depends on TRACING
Signed-off-by: Nam Cao <namcao@linutronix.de>
---
kernel/trace/rv/Kconfig | 3 +-
kernel/trace/rv/Makefile | 1 +
kernel/trace/rv/monitors/sleep/Kconfig | 13 +
kernel/trace/rv/monitors/sleep/sleep.c | 201 +++++++++++++
kernel/trace/rv/monitors/sleep/sleep.h | 287 +++++++++++++++++++
kernel/trace/rv/monitors/sleep/sleep_trace.h | 14 +
kernel/trace/rv/rv_trace.h | 1 +
tools/verification/models/rtapp/sleep.ltl | 15 +
8 files changed, 534 insertions(+), 1 deletion(-)
create mode 100644 kernel/trace/rv/monitors/sleep/Kconfig
create mode 100644 kernel/trace/rv/monitors/sleep/sleep.c
create mode 100644 kernel/trace/rv/monitors/sleep/sleep.h
create mode 100644 kernel/trace/rv/monitors/sleep/sleep_trace.h
create mode 100644 tools/verification/models/rtapp/sleep.ltl
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 6f86d8501e87..942d57575e67 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -20,7 +20,7 @@ config RV_LTL_MONITOR
menuconfig RV
bool "Runtime Verification"
- depends on TRACING
+ select TRACING
help
Enable the kernel runtime verification infrastructure. RV is a
lightweight (yet rigorous) method that complements classical
@@ -43,6 +43,7 @@ source "kernel/trace/rv/monitors/snep/Kconfig"
source "kernel/trace/rv/monitors/sncid/Kconfig"
source "kernel/trace/rv/monitors/rtapp/Kconfig"
source "kernel/trace/rv/monitors/pagefault/Kconfig"
+source "kernel/trace/rv/monitors/sleep/Kconfig"
# Add new monitors here
config RV_REACTORS
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 353ecf939d0e..13ec2944c665 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o
obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o
obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o
obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o
+obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o
# Add new monitors here
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
diff --git a/kernel/trace/rv/monitors/sleep/Kconfig b/kernel/trace/rv/monitors/sleep/Kconfig
new file mode 100644
index 000000000000..d00aa1aae069
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SLEEP
+ depends on RV
+ select RV_LTL_MONITOR
+ depends on HAVE_SYSCALL_TRACEPOINTS
+ depends on RV_MON_RTAPP
+ select TRACE_IRQFLAGS
+ default y
+ select LTL_MON_EVENTS_ID
+ bool "sleep monitor"
+ help
+ Monitor that real-time tasks do not sleep in a manner that may cause undesirable latency.
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
new file mode 100644
index 000000000000..8907606a3a05
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rv.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/rt.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "sleep"
+
+#include <trace/events/syscalls.h>
+#include <trace/events/sched.h>
+#include <trace/events/lock.h>
+#include <uapi/linux/futex.h>
+#include <rv_trace.h>
+#include <monitors/rtapp/rtapp.h>
+
+#include "sleep.h"
+#include <rv/ltl_monitor.h>
+
+static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon)
+{
+ ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task));
+}
+
+static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation)
+{
+ /* event-like atomic propositions */
+ ltl_atom_set(mon, LTL_SLEEP, false);
+ ltl_atom_set(mon, LTL_WAKE, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_HARDIRQ, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_NMI, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false);
+
+ if (task_creation) {
+ ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP, false);
+ ltl_atom_set(mon, LTL_PI_FUTEX, false);
+ ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
+ }
+
+ if (task->flags & PF_KTHREAD) {
+ ltl_atom_set(mon, LTL_KERNEL_THREAD, true);
+
+ /* kernel tasks do not do syscall */
+ ltl_atom_set(mon, LTL_PI_FUTEX, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP, false);
+
+ if (strstarts(task->comm, "migration/"))
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true);
+ else
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false);
+
+ if (strstarts(task->comm, "rcu"))
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, true);
+ else
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, false);
+ } else {
+ ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
+ ltl_atom_set(mon, LTL_KERNEL_THREAD, false);
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, false);
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false);
+ }
+
+}
+
+static void handle_sched_switch(void *data, bool preempt, struct task_struct *prev,
+ struct task_struct *next, unsigned int prev_state)
+{
+ struct ltl_monitor *mon = ltl_get_monitor(next);
+
+ if (prev_state & TASK_INTERRUPTIBLE)
+ ltl_atom_pulse(prev, LTL_SLEEP, true);
+ ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
+ ltl_atom_pulse(next, LTL_WAKE, true);
+}
+
+static void handle_sched_waking(void *data, struct task_struct *task)
+{
+ if (this_cpu_read(hardirq_context)) {
+ ltl_atom_pulse(task, LTL_WOKEN_BY_HARDIRQ, true);
+ } else if (in_task()) {
+ if (current->prio <= task->prio)
+ ltl_atom_pulse(task, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, true);
+ } else if (in_nmi()) {
+ ltl_atom_pulse(task, LTL_WOKEN_BY_NMI, true);
+ }
+}
+
+static void handle_sched_pi_setprio(void *data, struct task_struct *task,
+ struct task_struct *pi_task)
+{
+ if (pi_task)
+ ltl_atom_update(pi_task, LTL_BLOCK_ON_RT_MUTEX, true);
+}
+
+static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
+{
+ struct ltl_monitor *mon;
+ unsigned long args[6];
+ int op, cmd;
+
+ mon = ltl_get_monitor(current);
+
+ switch (id) {
+ case __NR_nanosleep:
+ case __NR_clock_nanosleep:
+#ifdef __NR_clock_nanosleep_time64
+ case __NR_clock_nanosleep_time64:
+#endif
+ ltl_atom_update(current, LTL_NANOSLEEP, true);
+ break;
+
+ case __NR_futex:
+#ifdef __NR_futex_time64
+ case __NR_futex_time64:
+#endif
+ syscall_get_arguments(current, regs, args);
+ op = args[1];
+ cmd = op & FUTEX_CMD_MASK;
+
+ switch (cmd) {
+ case FUTEX_LOCK_PI:
+ case FUTEX_LOCK_PI2:
+ case FUTEX_WAIT_REQUEUE_PI:
+ ltl_atom_update(current, LTL_PI_FUTEX, true);
+ }
+ break;
+ }
+}
+
+static void handle_sys_exit(void *data, struct pt_regs *regs, long ret)
+{
+ struct ltl_monitor *mon = ltl_get_monitor(current);
+
+ ltl_atom_set(mon, LTL_PI_FUTEX, false);
+ ltl_atom_update(current, LTL_NANOSLEEP, false);
+}
+
+static void handle_kthread_stop(void *data, struct task_struct *task)
+{
+ /* FIXME: this could race with other tracepoint handlers */
+ ltl_atom_update(task, LTL_KTHREAD_SHOULD_STOP, true);
+}
+
+static int enable_sleep(void)
+{
+ int retval;
+
+ retval = ltl_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking);
+ rv_attach_trace_probe("rtapp_sleep", sched_pi_setprio, handle_sched_pi_setprio);
+ rv_attach_trace_probe("rtapp_sleep", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter);
+ rv_attach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit);
+ rv_attach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop);
+ return 0;
+}
+
+static void disable_sleep(void)
+{
+ rv_detach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking);
+ rv_detach_trace_probe("rtapp_sleep", sched_pi_setprio, handle_sched_pi_setprio);
+ rv_detach_trace_probe("rtapp_sleep", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter);
+ rv_detach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit);
+ rv_detach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop);
+
+ ltl_monitor_destroy();
+}
+
+static struct rv_monitor rv_sleep = {
+ .name = "sleep",
+ .description = "Monitor that RT tasks do not undesirably sleep",
+ .enable = enable_sleep,
+ .disable = disable_sleep,
+};
+
+static int __init register_sleep(void)
+{
+ return rv_register_monitor(&rv_sleep, &rv_rtapp);
+}
+
+static void __exit unregister_sleep(void)
+{
+ rv_unregister_monitor(&rv_sleep);
+}
+
+module_init(register_sleep);
+module_exit(unregister_sleep);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("sleep: Monitor that RT tasks do not undesirably sleep");
diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h
new file mode 100644
index 000000000000..3d2283644c9b
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep.h
@@ -0,0 +1,287 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/rv.h>
+
+#define MONITOR_NAME sleep
+
+enum ltl_atom {
+ LTL_BLOCK_ON_RT_MUTEX,
+ LTL_KERNEL_THREAD,
+ LTL_KTHREAD_SHOULD_STOP,
+ LTL_NANOSLEEP,
+ LTL_PI_FUTEX,
+ LTL_RT,
+ LTL_SLEEP,
+ LTL_TASK_IS_MIGRATION,
+ LTL_TASK_IS_RCU,
+ LTL_WAKE,
+ LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ LTL_WOKEN_BY_HARDIRQ,
+ LTL_WOKEN_BY_NMI,
+ LTL_NUM_ATOM
+};
+static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM);
+
+static const char *ltl_atom_str(enum ltl_atom atom)
+{
+ static const char *const names[] = {
+ "bl_on_rt_mu",
+ "ker_th",
+ "kth_sh_st",
+ "na",
+ "pi_fu",
+ "rt",
+ "sl",
+ "ta_mi",
+ "ta_rc",
+ "wak",
+ "wo_eq_hi_pr",
+ "wo_ha",
+ "wo_nm",
+ };
+
+ return names[atom];
+}
+
+enum ltl_buchi_state {
+ S0,
+ S1,
+ S2,
+ S3,
+ S4,
+ S5,
+ S6,
+ S7,
+ S8,
+ S9,
+ S10,
+ RV_NUM_BA_STATES
+};
+static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
+
+static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
+{
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool val30 = task_is_rcu || task_is_migration;
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool val4 = block_on_rt_mutex || val30;
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
+ bool val24 = woken_by_nmi || kthread_should_stop;
+ bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
+ bool val25 = woken_by_hardirq || val24;
+ bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ mon->atoms);
+ bool val14 = woken_by_equal_or_higher_prio || val25;
+ bool wake = test_bit(LTL_WAKE, mon->atoms);
+ bool val13 = !wake;
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool nanosleep = test_bit(LTL_NANOSLEEP, mon->atoms);
+ bool pi_futex = test_bit(LTL_PI_FUTEX, mon->atoms);
+ bool val9 = pi_futex || nanosleep;
+ bool val11 = val9 || kernel_thread;
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool val2 = !sleep;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+
+ if (val1)
+ __set_bit(S0, mon->states);
+ if (val2)
+ __set_bit(S1, mon->states);
+ if (val11 && val13)
+ __set_bit(S2, mon->states);
+ if (val11 && val14)
+ __set_bit(S7, mon->states);
+ if (val4)
+ __set_bit(S8, mon->states);
+}
+
+static void
+ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
+{
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool val30 = task_is_rcu || task_is_migration;
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool val4 = block_on_rt_mutex || val30;
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
+ bool val24 = woken_by_nmi || kthread_should_stop;
+ bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
+ bool val25 = woken_by_hardirq || val24;
+ bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ mon->atoms);
+ bool val14 = woken_by_equal_or_higher_prio || val25;
+ bool wake = test_bit(LTL_WAKE, mon->atoms);
+ bool val13 = !wake;
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool nanosleep = test_bit(LTL_NANOSLEEP, mon->atoms);
+ bool pi_futex = test_bit(LTL_PI_FUTEX, mon->atoms);
+ bool val9 = pi_futex || nanosleep;
+ bool val11 = val9 || kernel_thread;
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool val2 = !sleep;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+
+ switch (state) {
+ case S0:
+ if (val1)
+ __set_bit(S0, next);
+ if (val2)
+ __set_bit(S1, next);
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val4)
+ __set_bit(S8, next);
+ break;
+ case S1:
+ if (val1)
+ __set_bit(S0, next);
+ if (val2)
+ __set_bit(S1, next);
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val4)
+ __set_bit(S8, next);
+ break;
+ case S2:
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val1 && val13)
+ __set_bit(S3, next);
+ if (val13 && val2)
+ __set_bit(S4, next);
+ if (val13 && val4)
+ __set_bit(S5, next);
+ if (val1 && val14)
+ __set_bit(S6, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val14 && val2)
+ __set_bit(S9, next);
+ if (val14 && val4)
+ __set_bit(S10, next);
+ break;
+ case S3:
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val1 && val13)
+ __set_bit(S3, next);
+ if (val13 && val2)
+ __set_bit(S4, next);
+ if (val13 && val4)
+ __set_bit(S5, next);
+ if (val1 && val14)
+ __set_bit(S6, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val14 && val2)
+ __set_bit(S9, next);
+ if (val14 && val4)
+ __set_bit(S10, next);
+ break;
+ case S4:
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val1 && val13)
+ __set_bit(S3, next);
+ if (val13 && val2)
+ __set_bit(S4, next);
+ if (val13 && val4)
+ __set_bit(S5, next);
+ if (val1 && val14)
+ __set_bit(S6, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val14 && val2)
+ __set_bit(S9, next);
+ if (val14 && val4)
+ __set_bit(S10, next);
+ break;
+ case S5:
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val1 && val13)
+ __set_bit(S3, next);
+ if (val13 && val2)
+ __set_bit(S4, next);
+ if (val13 && val4)
+ __set_bit(S5, next);
+ if (val1 && val14)
+ __set_bit(S6, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val14 && val2)
+ __set_bit(S9, next);
+ if (val14 && val4)
+ __set_bit(S10, next);
+ break;
+ case S6:
+ if (val1)
+ __set_bit(S0, next);
+ if (val2)
+ __set_bit(S1, next);
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val4)
+ __set_bit(S8, next);
+ break;
+ case S7:
+ if (val1)
+ __set_bit(S0, next);
+ if (val2)
+ __set_bit(S1, next);
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val4)
+ __set_bit(S8, next);
+ break;
+ case S8:
+ if (val1)
+ __set_bit(S0, next);
+ if (val2)
+ __set_bit(S1, next);
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val4)
+ __set_bit(S8, next);
+ break;
+ case S9:
+ if (val1)
+ __set_bit(S0, next);
+ if (val2)
+ __set_bit(S1, next);
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val4)
+ __set_bit(S8, next);
+ break;
+ case S10:
+ if (val1)
+ __set_bit(S0, next);
+ if (val2)
+ __set_bit(S1, next);
+ if (val11 && val13)
+ __set_bit(S2, next);
+ if (val11 && val14)
+ __set_bit(S7, next);
+ if (val4)
+ __set_bit(S8, next);
+ break;
+ }
+}
diff --git a/kernel/trace/rv/monitors/sleep/sleep_trace.h b/kernel/trace/rv/monitors/sleep/sleep_trace.h
new file mode 100644
index 000000000000..22eaf31da987
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep_trace.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SLEEP
+DEFINE_EVENT(event_ltl_monitor_id, event_sleep,
+ TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+ TP_ARGS(task, states, atoms, next));
+DEFINE_EVENT(error_ltl_monitor_id, error_sleep,
+ TP_PROTO(struct task_struct *task),
+ TP_ARGS(task));
+#endif /* CONFIG_RV_MON_SLEEP */
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 02c906c9745b..283d5c2fd055 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -173,6 +173,7 @@ TRACE_EVENT(error_ltl_monitor_id,
TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid)
);
#include <monitors/pagefault/pagefault_trace.h>
+#include <monitors/sleep/sleep_trace.h>
// Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here
#endif /* CONFIG_LTL_MON_EVENTS_ID */
#endif /* _TRACE_RV_H */
diff --git a/tools/verification/models/rtapp/sleep.ltl b/tools/verification/models/rtapp/sleep.ltl
new file mode 100644
index 000000000000..416ace2da0f2
--- /dev/null
+++ b/tools/verification/models/rtapp/sleep.ltl
@@ -0,0 +1,15 @@
+RULE = always (RT imply (SLEEP imply (RT_FRIENDLY_SLEEP or ALLOWLIST)))
+
+RT_FRIENDLY_SLEEP = (RT_VALID_SLEEP_REASON or KERNEL_THREAD)
+ and ((not WAKE) until RT_FRIENDLY_WAKE)
+
+RT_VALID_SLEEP_REASON = PI_FUTEX or NANOSLEEP
+
+RT_FRIENDLY_WAKE = WOKEN_BY_EQUAL_OR_HIGHER_PRIO
+ or WOKEN_BY_HARDIRQ
+ or WOKEN_BY_NMI
+ or KTHREAD_SHOULD_STOP
+
+ALLOWLIST = BLOCK_ON_RT_MUTEX
+ or TASK_IS_RCU
+ or TASK_IS_MIGRATION
--
2.39.5
On Wed, 2025-04-23 at 08:50 +0200, Nam Cao wrote: > Add a monitor for checking that real-time tasks do not go to sleep in > a > manner that may cause undesirable latency. > > Also change > RV depends on TRACING > to > RV select TRACING > to avoid the following recursive dependency: > > error: recursive dependency detected! > symbol TRACING is selected by PREEMPTIRQ_TRACEPOINTS > symbol PREEMPTIRQ_TRACEPOINTS depends on TRACE_IRQFLAGS > symbol TRACE_IRQFLAGS is selected by RV_MON_SLEEP > symbol RV_MON_SLEEP depends on RV > symbol RV depends on TRACING > > Signed-off-by: Nam Cao <namcao@linutronix.de> > --- > I've been playing with these monitors, code-wise they look good. I tested a bit and they seem to work without many surprises by doing something as simple as: perf stat -e rv:error_sleep stress-ng --cpu-sched 1 -t 10s -- shows several errors -- perf stat -e rv:error_sleep stress-ng --prio-inv 1 --prio-inv-policy rr -- shows only 1 error (normal while starting the program?) -- Not quite sound, but does it look a reasonable test to you? I quickly tried the same with the other monitor comparing the number of errors with the page_faults generated by perf, but that didn't make too much sense. Perhaps I'm doing something wrong here though (the number reported by perf for page faults feels a bit too high). perf stat -e page-faults -e rv:error_pagefault stress-ng --cyclic 1 Anyway, the monitor looks good to me Reviewed-by: Gabriele Monaco <gmonaco@redhat.com> but it'd be nice if you have tips to share how to quickly test it (e.g. without writing a custom workload). Thanks, Gabriele
On Thu, Apr 24, 2025 at 03:55:34PM +0200, Gabriele Monaco wrote: > I've been playing with these monitors, code-wise they look good. > I tested a bit and they seem to work without many surprises by doing > something as simple as: > > perf stat -e rv:error_sleep stress-ng --cpu-sched 1 -t 10s > -- shows several errors -- This one is a monitor's bug. The monitor mistakenly sees the task getting woken up, *then* sees it going to sleep. This is due to trace_sched_switch() being called with a stale 'prev_state'. 'prev_state' is read at the beginning of __schedule(), but trace_sched_switch() is invoked a bit later. Therefore if task->__state is changed inbetween, 'prev_state' is not the value of task->__state. The monitor checks (prev_state & TASK_INTERRUPTIBLE) to determine if the task is going to sleep. This can be incorrect due to the race above. The monitor sees the task going to sleep, but actually it is just preempted. I think this also answers the race you observed with the srs monitor? > perf stat -e rv:error_sleep stress-ng --prio-inv 1 --prio-inv-policy rr > -- shows only 1 error (normal while starting the program?) -- > > Not quite sound, but does it look a reasonable test to you? The above command use mutexes with priority inheritance. That is good for real-time. The errors are due to real-time tasks being delayed by waitpid(). Priority inheritance can be disabled with "--prio-inv-type none". Then you will see lots of errors with mutexes. > I quickly tried the same with the other monitor comparing the number of > errors with the page_faults generated by perf, but that didn't make too > much sense. Perhaps I'm doing something wrong here though (the number > reported by perf for page faults feels a bit too high). > > perf stat -e page-faults -e rv:error_pagefault stress-ng --cyclic 1 This command run a non-real-time thread to do setup, and a cyclic real-time thread. The number of pagefaults of each thread would be roughly proportional to the code size executed by each thread. As the non-real-time thread's code size is bigger, it sounds reasonable that the number of pagefaults is greater than the number of monitor's warnings. > > Anyway, the monitor looks good to me > > Reviewed-by: Gabriele Monaco <gmonaco@redhat.com> > > but it'd be nice if you have tips to share how to quickly test it (e.g. > without writing a custom workload). I tested the monitor on a real system. My system has some real-time audio processing processes (pipewire, firefox running youtube), yours also should. But thanks so much for testing with stress-ng. My testing didn't stress the system enough for the above race to happen. I will give stress-ng a few runs before the next version. Best regards, Nam
On 2025-04-25, Nam Cao <namcao@linutronix.de> wrote: > On Thu, Apr 24, 2025 at 03:55:34PM +0200, Gabriele Monaco wrote: >> I've been playing with these monitors, code-wise they look good. >> I tested a bit and they seem to work without many surprises by doing >> something as simple as: >> >> perf stat -e rv:error_sleep stress-ng --cpu-sched 1 -t 10s >> -- shows several errors -- > > This one is a monitor's bug. > > The monitor mistakenly sees the task getting woken up, *then* sees it going > to sleep. > > This is due to trace_sched_switch() being called with a stale 'prev_state'. > 'prev_state' is read at the beginning of __schedule(), but > trace_sched_switch() is invoked a bit later. Therefore if task->__state is > changed inbetween, 'prev_state' is not the value of task->__state. > > The monitor checks (prev_state & TASK_INTERRUPTIBLE) to determine if the > task is going to sleep. This can be incorrect due to the race above. The > monitor sees the task going to sleep, but actually it is just preempted. If I understand this correctly, trace_sched_switch() is reporting accurate state transition information, but by the time it is reported that state may have already changed (in which case another trace_sched_switch() occurs later). So in this example, the task did go to sleep. Why do you think it was preempted instead? John Ogness
On 2025-04-25, John Ogness <john.ogness@linutronix.de> wrote: >>> perf stat -e rv:error_sleep stress-ng --cpu-sched 1 -t 10s >>> -- shows several errors -- >> >> This one is a monitor's bug. >> >> The monitor mistakenly sees the task getting woken up, *then* sees it going >> to sleep. >> >> This is due to trace_sched_switch() being called with a stale 'prev_state'. >> 'prev_state' is read at the beginning of __schedule(), but >> trace_sched_switch() is invoked a bit later. Therefore if task->__state is >> changed inbetween, 'prev_state' is not the value of task->__state. >> >> The monitor checks (prev_state & TASK_INTERRUPTIBLE) to determine if the >> task is going to sleep. This can be incorrect due to the race above. The >> monitor sees the task going to sleep, but actually it is just preempted. > > If I understand this correctly, trace_sched_switch() is reporting > accurate state transition information, but by the time it is reported > that state may have already changed (in which case another > trace_sched_switch() occurs later). > > So in this example, the task did go to sleep. Why do you think it was > preempted instead? On 2025-04-25, Gabriele Monaco <gmonaco@redhat.com> wrote: > Peter's fix [1] landed on next recently, I guess in a couple of days > you'll get it on the upstream tree and you may not see the problem. Ah, thanks for pointing that out! > [1] - https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=8feb053d53194382fcfb68231296fdc220497ea6
On Fri, Apr 25, 2025 at 09:54:27AM +0206, John Ogness wrote:
> On 2025-04-25, John Ogness <john.ogness@linutronix.de> wrote:
> > If I understand this correctly, trace_sched_switch() is reporting
> > accurate state transition information, but by the time it is reported
> > that state may have already changed (in which case another
> > trace_sched_switch() occurs later).
> >
> > So in this example, the task did go to sleep. Why do you think it was
> > preempted instead?
You are right, the task did go to sleep. Scratch what I said earlier.
The monitor checks that if an RT task going to sleep, it will be woken by a
"RT-friendly" source. The problem is that trace_sched_switch() may appear
after trace_sched_waking(). The monitor sees the task sleeps, and waits
until the task is woken and checks the waker. But the monitor doesn't see
the task being woken, because it has already happened before the task
sleeps.
For correct ordering, we could:
- Use trace_sched_wakeup() instead of trace_sched_waking(). This would
have correct order, but information about the waker is gone at
trace_sched_wakeup(), so it doesn't work.
- Use trace_set_current_state() instead of trace_sched_switch() to
determine task going to sleep.
The latter option works, but then "sleep" would be defined as task doing
set_current_state(TASK_INTERRUPTIBLE). This new definition is probably not
precise? But for the monitor, it's fine.
Btw, while testing this, I discovered another bug. Real-time thread may
"legally" sleep by a 'restart' syscall after 'nanosleep'. The monitor
doesn't recognize this syscall as valid sleep reason and flags it :(
> On 2025-04-25, Gabriele Monaco <gmonaco@redhat.com> wrote:
> > Peter's fix [1] landed on next recently, I guess in a couple of days
> > you'll get it on the upstream tree and you may not see the problem.
This patch fixes stale prev_state due to signaling. It doesn't fix this
case.
Best regards,
Nam
2025-04-25T06:35:09Z Nam Cao <namcao@linutronix.de>: > On Thu, Apr 24, 2025 at 03:55:34PM +0200, Gabriele Monaco wrote: >> I've been playing with these monitors, code-wise they look good. >> I tested a bit and they seem to work without many surprises by doing >> something as simple as: >> >> perf stat -e rv:error_sleep stress-ng --cpu-sched 1 -t 10s >> -- shows several errors -- > > This one is a monitor's bug. > > The monitor mistakenly sees the task getting woken up, *then* sees it going > to sleep. > > This is due to trace_sched_switch() being called with a stale 'prev_state'. > 'prev_state' is read at the beginning of __schedule(), but > trace_sched_switch() is invoked a bit later. Therefore if task->__state is > changed inbetween, 'prev_state' is not the value of task->__state. > > The monitor checks (prev_state & TASK_INTERRUPTIBLE) to determine if the > task is going to sleep. This can be incorrect due to the race above. The > monitor sees the task going to sleep, but actually it is just preempted. > > I think this also answers the race you observed with the srs monitor? > Yeah that could be the culprit. Peter's fix [1] landed on next recently, I guess in a couple of days you'll get it on the upstream tree and you may not see the problem. Nevertheless, I didn't check exactly what --cpu-sched does, but I'm expecting it to do any wild thing to stress the scheduler, so it may be normal to have more errors than, say, --cyclic, which only runs nanosleeps. >> perf stat -e rv:error_sleep stress-ng --prio-inv 1 --prio-inv-policy rr >> -- shows only 1 error (normal while starting the program?) -- >> >> Not quite sound, but does it look a reasonable test to you? > > The above command use mutexes with priority inheritance. That is good for > real-time. The errors are due to real-time tasks being delayed by > waitpid(). > > Priority inheritance can be disabled with "--prio-inv-type none". Then you > will see lots of errors with mutexes. > Great, that's exactly what I wanted to know, thanks. >> I quickly tried the same with the other monitor comparing the number of >> errors with the page_faults generated by perf, but that didn't make too >> much sense. Perhaps I'm doing something wrong here though (the number >> reported by perf for page faults feels a bit too high). >> >> perf stat -e page-faults -e rv:error_pagefault stress-ng --cyclic 1 > > This command run a non-real-time thread to do setup, and a cyclic real-time > thread. The number of pagefaults of each thread would be roughly > proportional to the code size executed by each thread. As the non-real-time > thread's code size is bigger, it sounds reasonable that the number of > pagefaults is greater than the number of monitor's warnings. Mmh I guessed something like that, although numbers were a bit out of proportion (e.g. 500 page-faults and 8 errors), but again, I didn't check too carefully what happens under the hood. >> >> Anyway, the monitor looks good to me >> >> Reviewed-by: Gabriele Monaco <gmonaco@redhat.com> >> >> but it'd be nice if you have tips to share how to quickly test it (e.g. >> without writing a custom workload). > > I tested the monitor on a real system. My system has some real-time audio > processing processes (pipewire, firefox running youtube), yours also > should. That's a good point, also I didn't mention I was running these tests in a VM (virtme-ng), so the system stress is minimal and perhaps the setup triggers some different oddities (filesystems are overlays and some other things are set up differently from a real system). > > But thanks so much for testing with stress-ng. My testing didn't stress the > system enough for the above race to happen. I will give stress-ng a few > runs before the next version. > > Best regards, > Nam Thank you for the tips! Cheers, Gabriele [1] - https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=8feb053d53194382fcfb68231296fdc220497ea6
On Fri, Apr 25, 2025 at 07:35:36AM +0000, Gabriele Monaco wrote: > 2025-04-25T06:35:09Z Nam Cao <namcao@linutronix.de>: > > On Thu, Apr 24, 2025 at 03:55:34PM +0200, Gabriele Monaco wrote: > >> I quickly tried the same with the other monitor comparing the number of > >> errors with the page_faults generated by perf, but that didn't make too > >> much sense. Perhaps I'm doing something wrong here though (the number > >> reported by perf for page faults feels a bit too high). > >> > >> perf stat -e page-faults -e rv:error_pagefault stress-ng --cyclic 1 > > > > This command run a non-real-time thread to do setup, and a cyclic real-time > > thread. The number of pagefaults of each thread would be roughly > > proportional to the code size executed by each thread. As the non-real-time > > thread's code size is bigger, it sounds reasonable that the number of > > pagefaults is greater than the number of monitor's warnings. > > Mmh I guessed something like that, although numbers were a bit out of > proportion (e.g. 500 page-faults and 8 errors), but again, I didn't check > too carefully what happens under the hood. Keep in mind that the non-real-time thread is calling into glibc. While the real-time thread is a small loop doing nanosleep. > > I tested the monitor on a real system. My system has some real-time audio > > processing processes (pipewire, firefox running youtube), yours also > > should. > > That's a good point, also I didn't mention I was running these tests in a > VM (virtme-ng), so the system stress is minimal and perhaps the setup > triggers some different oddities (filesystems are overlays and some other > things are set up differently from a real system). Oddities are good, they make some corner cases appear. Evidently I need to torture the monitors much more, let's see what else shows up.. Best regards, Nam
© 2016 - 2025 Red Hat, Inc.