The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
to protect invocation of __DO_TRACE_CALL() means that BPF programs
attached to tracepoints are non-preemptible. This is unhelpful in
real-time systems, whose users apparently wish to use BPF while also
achieving low latencies. (Who knew?)
One option would be to use preemptible RCU, but this introduces
many opportunities for infinite recursion, which many consider to
be counterproductive, especially given the relatively small stacks
provided by the Linux kernel. These opportunities could be shut down
by sufficiently energetic duplication of code, but this sort of thing
is considered impolite in some circles.
Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
readers than those of preemptible RCU, at least on Paul E. McKenney's
laptop, where task_struct access is more expensive than access to per-CPU
variables. And SRCU-fast provides way faster readers than does SRCU,
courtesy of being able to avoid the read-side use of smp_mb(). Also,
it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
functions.
While in the area, SRCU now supports early boot call_srcu(). Therefore,
remove the checks that used to avoid such use from rcu_free_old_probes()
before this commit was applied:
e53244e2c893 ("tracepoint: Remove SRCU protection")
The current commit can be thought of as an approximate revert of that
commit, with some compensating additions of preemption disabling.
This preemption disabling uses guard(preempt_notrace)().
However, Yonghong Song points out that BPF assumes that non-sleepable
BPF programs will remain on the same CPU, which means that migration
must be disabled whenever preemption remains enabled. In addition,
non-RT kernels have performance expectations that would be violated by
allowing the BPF programs to be preempted.
Therefore, continue to disable preemption in non-RT kernels, and protect
the BPF program with both SRCU and migration disabling for RT kernels,
and even then only if preemption is not already disabled.
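For a hypothetical tracepoint "foo" (the name and argument are made up;
the guards are the ones introduced below), the resulting read-side
protection amounts to roughly the following sketch of what
tracepoint_guard() expands to, not the literal macro output:

static inline void __do_trace_foo(int arg)
{
	if (cpu_online(raw_smp_processor_id())) {	/* the usual "cond" */
#ifdef CONFIG_PREEMPT_RT
		/* Preemptible read-side section, but no migration. */
		guard(srcu_fast_notrace)(&tracepoint_srcu);
		guard(migrate)();
#else
		/* Keep the traditional non-preemptible section. */
		guard(preempt_notrace)();
#endif
		__DO_TRACE_CALL(foo, TP_ARGS(arg));
	}
}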
[ paulmck: Apply kernel test robot and Yonghong Song feedback. ]
[ paulmck: Remove trace_syscalls.h changes per Steven Rostedt. ]
Link: https://lore.kernel.org/all/20250613152218.1924093-1-bigeasy@linutronix.de/
Signed-off-by: Steve Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: <bpf@vger.kernel.org>
---
include/linux/trace_events.h | 20 ++++++++++++++++++++
include/linux/tracepoint.h | 25 ++++++++++++++++++++++---
include/trace/perf.h | 4 ++--
include/trace/trace_events.h | 21 +++++++++++++++++++--
kernel/trace/trace_events.c | 8 +-------
kernel/tracepoint.c | 33 +++++++++++++++++++++++++++++++++
6 files changed, 97 insertions(+), 14 deletions(-)
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 3690221ba3d80..c38988778f525 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -222,6 +222,26 @@ static inline unsigned int tracing_gen_ctx_dec(void)
return trace_ctx;
}
+/*
+ * When PREEMPT_RT is enabled, trace events are called with disabled
+ * migration. The trace events need to know if the tracepoint disabled
+ * migration or not so that what is recorded to the ring buffer shows
+ * the state of when the trace event triggered, and not the state caused
+ * by the trace event.
+ */
+#ifdef CONFIG_PREEMPT_RT
+static inline unsigned int tracing_gen_ctx_dec_cond(void)
+{
+ unsigned int trace_ctx;
+
+ trace_ctx = tracing_gen_ctx_dec();
+ /* The migration counter starts at bit 4 */
+ return trace_ctx - (1 << 4);
+}
+#else
+# define tracing_gen_ctx_dec_cond() tracing_gen_ctx_dec()
+#endif
+
struct trace_event_file;
struct ring_buffer_event *
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 8a56f3278b1b9..0563c7d9fcb22 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -100,6 +100,25 @@ void for_each_tracepoint_in_module(struct module *mod,
}
#endif /* CONFIG_MODULES */
+/*
+ * BPF programs can attach to the tracepoint callbacks. But if the
+ * callbacks are called with preemption disabled, the BPF programs
+ * can cause quite a bit of latency. When PREEMPT_RT is enabled,
+ * instead of disabling preemption, use srcu_fast_notrace() for
+ * synchronization. As BPF programs that are attached to tracepoints
+ * expect to stay on the same CPU, also disable migration.
+ */
+#ifdef CONFIG_PREEMPT_RT
+extern struct srcu_struct tracepoint_srcu;
+# define tracepoint_sync() synchronize_srcu(&tracepoint_srcu);
+# define tracepoint_guard() \
+ guard(srcu_fast_notrace)(&tracepoint_srcu); \
+ guard(migrate)()
+#else
+# define tracepoint_sync() synchronize_rcu();
+# define tracepoint_guard() guard(preempt_notrace)()
+#endif
+
/*
* tracepoint_synchronize_unregister must be called between the last tracepoint
* probe unregistration and the end of module exit to make sure there is no
@@ -115,7 +134,7 @@ void for_each_tracepoint_in_module(struct module *mod,
static inline void tracepoint_synchronize_unregister(void)
{
synchronize_rcu_tasks_trace();
- synchronize_rcu();
+ tracepoint_sync();
}
static inline bool tracepoint_is_faultable(struct tracepoint *tp)
{
@@ -275,13 +294,13 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
return static_branch_unlikely(&__tracepoint_##name.key);\
}
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
__DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \
static inline void __do_trace_##name(proto) \
{ \
TRACEPOINT_CHECK(name) \
if (cond) { \
- guard(preempt_notrace)(); \
+ tracepoint_guard(); \
__DO_TRACE_CALL(name, TP_ARGS(args)); \
} \
} \
diff --git a/include/trace/perf.h b/include/trace/perf.h
index a1754b73a8f55..348ad1d9b5566 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -71,6 +71,7 @@ perf_trace_##call(void *__data, proto) \
u64 __count __attribute__((unused)); \
struct task_struct *__task __attribute__((unused)); \
\
+ guard(preempt_notrace)(); \
do_perf_trace_##call(__data, args); \
}
@@ -85,9 +86,8 @@ perf_trace_##call(void *__data, proto) \
struct task_struct *__task __attribute__((unused)); \
\
might_fault(); \
- preempt_disable_notrace(); \
+ guard(preempt_notrace)(); \
do_perf_trace_##call(__data, args); \
- preempt_enable_notrace(); \
}
/*
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 4f22136fd4656..6fb58387e9f15 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -429,6 +429,22 @@ do_trace_event_raw_event_##call(void *__data, proto) \
trace_event_buffer_commit(&fbuffer); \
}
+/*
+ * When PREEMPT_RT is enabled, the tracepoint does not disable preemption
+ * but instead disables migration. The callbacks for the trace events
+ * need to have a consistent state so that it can reflect the proper
+ * preempt_disabled counter.
+ */
+#ifdef CONFIG_PREEMPT_RT
+/* disable preemption for RT so that the counters still match */
+# define trace_event_guard() guard(preempt_notrace)()
+/* Have syscalls up the migrate disable counter to emulate non-syscalls */
+# define trace_syscall_event_guard() guard(migrate)()
+#else
+# define trace_event_guard()
+# define trace_syscall_event_guard()
+#endif
+
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
@@ -436,6 +452,7 @@ __DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
static notrace void \
trace_event_raw_event_##call(void *__data, proto) \
{ \
+ trace_event_guard(); \
do_trace_event_raw_event_##call(__data, args); \
}
@@ -447,9 +464,9 @@ static notrace void \
trace_event_raw_event_##call(void *__data, proto) \
{ \
might_fault(); \
- preempt_disable_notrace(); \
+ trace_syscall_event_guard(); \
+ guard(preempt_notrace)(); \
do_trace_event_raw_event_##call(__data, args); \
- preempt_enable_notrace(); \
}
/*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b16a5a1580401..2d8fd140eaf9e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -659,13 +659,7 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
trace_event_ignore_this_pid(trace_file))
return NULL;
- /*
- * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
- * preemption (adding one to the preempt_count). Since we are
- * interested in the preempt_count at the time the tracepoint was
- * hit, we need to subtract one to offset the increment.
- */
- fbuffer->trace_ctx = tracing_gen_ctx_dec();
+ fbuffer->trace_ctx = tracing_gen_ctx_dec_cond();
fbuffer->trace_file = trace_file;
fbuffer->event =
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 62719d2941c90..6a6bcf86bfbed 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,12 @@ enum tp_func_state {
extern tracepoint_ptr_t __start___tracepoints_ptrs[];
extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
+/* In PREEMPT_RT, SRCU is used to protect the tracepoint callbacks */
+#ifdef CONFIG_PREEMPT_RT
+DEFINE_SRCU_FAST(tracepoint_srcu);
+EXPORT_SYMBOL_GPL(tracepoint_srcu);
+#endif
+
enum tp_transition_sync {
TP_TRANSITION_SYNC_1_0_1,
TP_TRANSITION_SYNC_N_2_1,
@@ -34,6 +40,7 @@ enum tp_transition_sync {
struct tp_transition_snapshot {
unsigned long rcu;
+ unsigned long srcu_gp;
bool ongoing;
};
@@ -46,6 +53,9 @@ static void tp_rcu_get_state(enum tp_transition_sync sync)
/* Keep the latest get_state snapshot. */
snapshot->rcu = get_state_synchronize_rcu();
+#ifdef CONFIG_PREEMPT_RT
+ snapshot->srcu_gp = start_poll_synchronize_srcu(&tracepoint_srcu);
+#endif
snapshot->ongoing = true;
}
@@ -56,6 +66,10 @@ static void tp_rcu_cond_sync(enum tp_transition_sync sync)
if (!snapshot->ongoing)
return;
cond_synchronize_rcu(snapshot->rcu);
+#ifdef CONFIG_PREEMPT_RT
+ if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu_gp))
+ synchronize_srcu(&tracepoint_srcu);
+#endif
snapshot->ongoing = false;
}
@@ -101,10 +115,22 @@ static inline void *allocate_probes(int count)
return p == NULL ? NULL : p->probes;
}
+#ifdef CONFIG_PREEMPT_RT
+static void srcu_free_old_probes(struct rcu_head *head)
+{
+ kfree(container_of(head, struct tp_probes, rcu));
+}
+
+static void rcu_free_old_probes(struct rcu_head *head)
+{
+ call_srcu(&tracepoint_srcu, head, srcu_free_old_probes);
+}
+#else
static void rcu_free_old_probes(struct rcu_head *head)
{
kfree(container_of(head, struct tp_probes, rcu));
}
+#endif
static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old)
{
@@ -112,6 +138,13 @@ static inline void release_probes(struct tracepoint *tp, struct tracepoint_func
struct tp_probes *tp_probes = container_of(old,
struct tp_probes, probes[0]);
+ /*
+ * Tracepoint probes are protected by either RCU or
+ * Tasks Trace RCU and also by SRCU. By calling the SRCU
+ * callback in the [Tasks Trace] RCU callback we cover
+ * both cases. So let us chain the SRCU and [Tasks Trace]
+ * RCU callbacks to wait for both grace periods.
+ */
if (tracepoint_is_faultable(tp))
call_rcu_tasks_trace(&tp_probes->rcu, rcu_free_old_probes);
else
> On Dec 8, 2025, at 1:20 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
>
> The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
> to protect invocation of __DO_TRACE_CALL() means that BPF programs
> attached to tracepoints are non-preemptible. This is unhelpful in
> real-time systems, whose users apparently wish to use BPF while also
> achieving low latencies. (Who knew?)
>
> One option would be to use preemptible RCU, but this introduces
> many opportunities for infinite recursion, which many consider to
> be counterproductive, especially given the relatively small stacks
> provided by the Linux kernel. These opportunities could be shut down
> by sufficiently energetic duplication of code, but this sort of thing
> is considered impolite in some circles.
>
> Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
> readers than those of preemptible RCU, at least on Paul E. McKenney's
> laptop, where task_struct access is more expensive than access to per-CPU
> variables. And SRCU-fast provides way faster readers than does SRCU,
> courtesy of being able to avoid the read-side use of smp_mb(). Also,
> it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
> functions.
>
> While in the area, SRCU now supports early boot call_srcu(). Therefore,
> remove the checks that used to avoid such use from rcu_free_old_probes()
> before this commit was applied:
>
> e53244e2c893 ("tracepoint: Remove SRCU protection")
>
> The current commit can be thought of as an approximate revert of that
> commit, with some compensating additions of preemption disabling.
> This preemption disabling uses guard(preempt_notrace)().
>
> However, Yonghong Song points out that BPF assumes that non-sleepable
> BPF programs will remain on the same CPU, which means that migration
> must be disabled whenever preemption remains enabled. In addition,
> non-RT kernels have performance expectations that would be violated by
> allowing the BPF programs to be preempted.
>
> Therefore, continue to disable preemption in non-RT kernels, and protect
> the BPF program with both SRCU and migration disabling for RT kernels,
> and even then only if preemption is not already disabled.
Hi Paul,
Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
thanks,
- Joel
On Thu, Dec 11, 2025 at 08:02:15PM +0000, Joel Fernandes wrote:
>
>
> > On Dec 8, 2025, at 1:20 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
> > to protect invocation of __DO_TRACE_CALL() means that BPF programs
> > attached to tracepoints are non-preemptible. This is unhelpful in
> > real-time systems, whose users apparently wish to use BPF while also
> > achieving low latencies. (Who knew?)
> >
> > One option would be to use preemptible RCU, but this introduces
> > many opportunities for infinite recursion, which many consider to
> > be counterproductive, especially given the relatively small stacks
> > provided by the Linux kernel. These opportunities could be shut down
> > by sufficiently energetic duplication of code, but this sort of thing
> > is considered impolite in some circles.
> >
> > Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
> > readers than those of preemptible RCU, at least on Paul E. McKenney's
> > laptop, where task_struct access is more expensive than access to per-CPU
> > variables. And SRCU-fast provides way faster readers than does SRCU,
> > courtesy of being able to avoid the read-side use of smp_mb(). Also,
> > it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
> > functions.
> >
> > While in the area, SRCU now supports early boot call_srcu(). Therefore,
> > remove the checks that used to avoid such use from rcu_free_old_probes()
> > before this commit was applied:
> >
> > e53244e2c893 ("tracepoint: Remove SRCU protection")
> >
> > The current commit can be thought of as an approximate revert of that
> > commit, with some compensating additions of preemption disabling.
> > This preemption disabling uses guard(preempt_notrace)().
> >
> > However, Yonghong Song points out that BPF assumes that non-sleepable
> > BPF programs will remain on the same CPU, which means that migration
> > must be disabled whenever preemption remains enabled. In addition,
> > non-RT kernels have performance expectations that would be violated by
> > allowing the BPF programs to be preempted.
> >
> > Therefore, continue to disable preemption in non-RT kernels, and protect
> > the BPF program with both SRCU and migration disabling for RT kernels,
> > and even then only if preemption is not already disabled.
>
> Hi Paul,
>
> Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
Because in some cases the non-RT benefit is suspected to be negative
due to increasing the probability of preemption in awkward places.
> Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
Thank you, and I will let Steven collect this one.
Thanx, Paul
On Thu, 11 Dec 2025 12:23:29 -0800
"Paul E. McKenney" <paulmck@kernel.org> wrote:

> > Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
>
> Because in some cases the non-RT benefit is suspected to be negative
> due to increasing the probability of preemption in awkward places.
>
> > Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
>
> Thank you, and I will let Steven collect this one.

Sure. Note, I'll be working on this next week.

-- Steve
On 12/11/2025 3:23 PM, Paul E. McKenney wrote:
> On Thu, Dec 11, 2025 at 08:02:15PM +0000, Joel Fernandes wrote:
>>
>>
>>> On Dec 8, 2025, at 1:20 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
>>>
>>> The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
>>> to protect invocation of __DO_TRACE_CALL() means that BPF programs
>>> attached to tracepoints are non-preemptible. This is unhelpful in
>>> real-time systems, whose users apparently wish to use BPF while also
>>> achieving low latencies. (Who knew?)
>>>
>>> One option would be to use preemptible RCU, but this introduces
>>> many opportunities for infinite recursion, which many consider to
>>> be counterproductive, especially given the relatively small stacks
>>> provided by the Linux kernel. These opportunities could be shut down
>>> by sufficiently energetic duplication of code, but this sort of thing
>>> is considered impolite in some circles.
>>>
>>> Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
>>> readers than those of preemptible RCU, at least on Paul E. McKenney's
>>> laptop, where task_struct access is more expensive than access to per-CPU
>>> variables. And SRCU-fast provides way faster readers than does SRCU,
>>> courtesy of being able to avoid the read-side use of smp_mb(). Also,
>>> it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
>>> functions.
>>>
>>> While in the area, SRCU now supports early boot call_srcu(). Therefore,
>>> remove the checks that used to avoid such use from rcu_free_old_probes()
>>> before this commit was applied:
>>>
>>> e53244e2c893 ("tracepoint: Remove SRCU protection")
>>>
>>> The current commit can be thought of as an approximate revert of that
>>> commit, with some compensating additions of preemption disabling.
>>> This preemption disabling uses guard(preempt_notrace)().
>>>
>>> However, Yonghong Song points out that BPF assumes that non-sleepable
>>> BPF programs will remain on the same CPU, which means that migration
>>> must be disabled whenever preemption remains enabled. In addition,
>>> non-RT kernels have performance expectations that would be violated by
>>> allowing the BPF programs to be preempted.
>>>
>>> Therefore, continue to disable preemption in non-RT kernels, and protect
>>> the BPF program with both SRCU and migration disabling for RT kernels,
>>> and even then only if preemption is not already disabled.
>>
>> Hi Paul,
>>
>> Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
>
> Because in some cases the non-RT benefit is suspected to be negative
> due to increasing the probability of preemption in awkward places.
>
Since you mentioned "suspected", I am guessing there is no concrete data
collected to substantiate that specifically for BPF programs, but correct me
if I missed something. Assuming you're referring to latency tradeoffs due to
preemption, Android is not PREEMPT_RT but is expected to be low latency in
general as well. So is this decision the right one for Android as well,
considering that (I heard) it uses BPF? Just an open-ended question.
There is also the issue of two different paths for PREEMPT_RT versus non-RT,
complicating the tracing side, so there had better be a reason for that, I guess.
thanks,
- Joel
On Fri, Dec 12, 2025 at 09:12:07AM +0900, Joel Fernandes wrote:
>
>
> On 12/11/2025 3:23 PM, Paul E. McKenney wrote:
> > On Thu, Dec 11, 2025 at 08:02:15PM +0000, Joel Fernandes wrote:
> >>
> >>
> >>> On Dec 8, 2025, at 1:20 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >>>
> >>> The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
> >>> to protect invocation of __DO_TRACE_CALL() means that BPF programs
> >>> attached to tracepoints are non-preemptible. This is unhelpful in
> >>> real-time systems, whose users apparently wish to use BPF while also
> >>> achieving low latencies. (Who knew?)
> >>>
> >>> One option would be to use preemptible RCU, but this introduces
> >>> many opportunities for infinite recursion, which many consider to
> >>> be counterproductive, especially given the relatively small stacks
> >>> provided by the Linux kernel. These opportunities could be shut down
> >>> by sufficiently energetic duplication of code, but this sort of thing
> >>> is considered impolite in some circles.
> >>>
> >>> Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
> >>> readers than those of preemptible RCU, at least on Paul E. McKenney's
> >>> laptop, where task_struct access is more expensive than access to per-CPU
> >>> variables. And SRCU-fast provides way faster readers than does SRCU,
> >>> courtesy of being able to avoid the read-side use of smp_mb(). Also,
> >>> it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
> >>> functions.
> >>>
> >>> While in the area, SRCU now supports early boot call_srcu(). Therefore,
> >>> remove the checks that used to avoid such use from rcu_free_old_probes()
> >>> before this commit was applied:
> >>>
> >>> e53244e2c893 ("tracepoint: Remove SRCU protection")
> >>>
> >>> The current commit can be thought of as an approximate revert of that
> >>> commit, with some compensating additions of preemption disabling.
> >>> This preemption disabling uses guard(preempt_notrace)().
> >>>
> >>> However, Yonghong Song points out that BPF assumes that non-sleepable
> >>> BPF programs will remain on the same CPU, which means that migration
> >>> must be disabled whenever preemption remains enabled. In addition,
> >>> non-RT kernels have performance expectations that would be violated by
> >>> allowing the BPF programs to be preempted.
> >>>
> >>> Therefore, continue to disable preemption in non-RT kernels, and protect
> >>> the BPF program with both SRCU and migration disabling for RT kernels,
> >>> and even then only if preemption is not already disabled.
> >>
> >> Hi Paul,
> >>
> >> Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
> >
> > Because in some cases the non-RT benefit is suspected to be negative
> > due to increasing the probability of preemption in awkward places.
>
> Since you mentioned suspected, I am guessing there is no concrete data collected
> to substantiate that specifically for BPF programs, but correct me if I missed
> something. Assuming you're referring to latency versus tradeoffs issues, due to
> preemption, Android is not PREEMPT_RT but is expected to be low latency in
> general as well. So is this decision the right one for Android as well,
> considering that (I heard) it uses BPF? Just an open-ended question.
>
> There is also issue of 2 different paths for PREEMPT_RT versus otherwise,
> complicating the tracing side so there better be a reason for that I guess.
You are advocating a change in behavior for non-RT workloads. Why do
you believe that this change would be OK for those workloads?
Thanx, Paul
> On Dec 12, 2025, at 9:47 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Fri, Dec 12, 2025 at 09:12:07AM +0900, Joel Fernandes wrote:
>>
>>
>>> On 12/11/2025 3:23 PM, Paul E. McKenney wrote:
>>> On Thu, Dec 11, 2025 at 08:02:15PM +0000, Joel Fernandes wrote:
>>>>
>>>>
>>>>> On Dec 8, 2025, at 1:20 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
>>>>>
>>>>> The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
>>>>> to protect invocation of __DO_TRACE_CALL() means that BPF programs
>>>>> attached to tracepoints are non-preemptible. This is unhelpful in
>>>>> real-time systems, whose users apparently wish to use BPF while also
>>>>> achieving low latencies. (Who knew?)
>>>>>
>>>>> One option would be to use preemptible RCU, but this introduces
>>>>> many opportunities for infinite recursion, which many consider to
>>>>> be counterproductive, especially given the relatively small stacks
>>>>> provided by the Linux kernel. These opportunities could be shut down
>>>>> by sufficiently energetic duplication of code, but this sort of thing
>>>>> is considered impolite in some circles.
>>>>>
>>>>> Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
>>>>> readers than those of preemptible RCU, at least on Paul E. McKenney's
>>>>> laptop, where task_struct access is more expensive than access to per-CPU
>>>>> variables. And SRCU-fast provides way faster readers than does SRCU,
>>>>> courtesy of being able to avoid the read-side use of smp_mb(). Also,
>>>>> it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
>>>>> functions.
>>>>>
>>>>> While in the area, SRCU now supports early boot call_srcu(). Therefore,
>>>>> remove the checks that used to avoid such use from rcu_free_old_probes()
>>>>> before this commit was applied:
>>>>>
>>>>> e53244e2c893 ("tracepoint: Remove SRCU protection")
>>>>>
>>>>> The current commit can be thought of as an approximate revert of that
>>>>> commit, with some compensating additions of preemption disabling.
>>>>> This preemption disabling uses guard(preempt_notrace)().
>>>>>
>>>>> However, Yonghong Song points out that BPF assumes that non-sleepable
>>>>> BPF programs will remain on the same CPU, which means that migration
>>>>> must be disabled whenever preemption remains enabled. In addition,
>>>>> non-RT kernels have performance expectations that would be violated by
>>>>> allowing the BPF programs to be preempted.
>>>>>
>>>>> Therefore, continue to disable preemption in non-RT kernels, and protect
>>>>> the BPF program with both SRCU and migration disabling for RT kernels,
>>>>> and even then only if preemption is not already disabled.
>>>>
>>>> Hi Paul,
>>>>
>>>> Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
>>>
>>> Because in some cases the non-RT benefit is suspected to be negative
>>> due to increasing the probability of preemption in awkward places.
>>
>> Since you mentioned suspected, I am guessing there is no concrete data collected
>> to substantiate that specifically for BPF programs, but correct me if I missed
>> something. Assuming you're referring to latency versus tradeoffs issues, due to
>> preemption, Android is not PREEMPT_RT but is expected to be low latency in
>> general as well. So is this decision the right one for Android as well,
>> considering that (I heard) it uses BPF? Just an open-ended question.
>>
>> There is also issue of 2 different paths for PREEMPT_RT versus otherwise,
>> complicating the tracing side so there better be a reason for that I guess.
>
> You are advocating a change in behavior for non-RT workloads. Why do
> you believe that this change would be OK for those workloads?
Same reasons I provided in my last email. If we are saying SRCU-fast is required for lower latency, I find it strange that we are leaving out Android which has low latency audio usecases, for instance.
Thanks,
- Joel
>
> Thanx, Paul
On Fri, Dec 12, 2025 at 03:43:07AM +0000, Joel Fernandes wrote:
>
>
> > On Dec 12, 2025, at 9:47 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Fri, Dec 12, 2025 at 09:12:07AM +0900, Joel Fernandes wrote:
> >>
> >>
> >>> On 12/11/2025 3:23 PM, Paul E. McKenney wrote:
> >>> On Thu, Dec 11, 2025 at 08:02:15PM +0000, Joel Fernandes wrote:
> >>>>
> >>>>
> >>>>> On Dec 8, 2025, at 1:20 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >>>>>
> >>>>> The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
> >>>>> to protect invocation of __DO_TRACE_CALL() means that BPF programs
> >>>>> attached to tracepoints are non-preemptible. This is unhelpful in
> >>>>> real-time systems, whose users apparently wish to use BPF while also
> >>>>> achieving low latencies. (Who knew?)
> >>>>>
> >>>>> One option would be to use preemptible RCU, but this introduces
> >>>>> many opportunities for infinite recursion, which many consider to
> >>>>> be counterproductive, especially given the relatively small stacks
> >>>>> provided by the Linux kernel. These opportunities could be shut down
> >>>>> by sufficiently energetic duplication of code, but this sort of thing
> >>>>> is considered impolite in some circles.
> >>>>>
> >>>>> Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
> >>>>> readers than those of preemptible RCU, at least on Paul E. McKenney's
> >>>>> laptop, where task_struct access is more expensive than access to per-CPU
> >>>>> variables. And SRCU-fast provides way faster readers than does SRCU,
> >>>>> courtesy of being able to avoid the read-side use of smp_mb(). Also,
> >>>>> it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
> >>>>> functions.
> >>>>>
> >>>>> While in the area, SRCU now supports early boot call_srcu(). Therefore,
> >>>>> remove the checks that used to avoid such use from rcu_free_old_probes()
> >>>>> before this commit was applied:
> >>>>>
> >>>>> e53244e2c893 ("tracepoint: Remove SRCU protection")
> >>>>>
> >>>>> The current commit can be thought of as an approximate revert of that
> >>>>> commit, with some compensating additions of preemption disabling.
> >>>>> This preemption disabling uses guard(preempt_notrace)().
> >>>>>
> >>>>> However, Yonghong Song points out that BPF assumes that non-sleepable
> >>>>> BPF programs will remain on the same CPU, which means that migration
> >>>>> must be disabled whenever preemption remains enabled. In addition,
> >>>>> non-RT kernels have performance expectations that would be violated by
> >>>>> allowing the BPF programs to be preempted.
> >>>>>
> >>>>> Therefore, continue to disable preemption in non-RT kernels, and protect
> >>>>> the BPF program with both SRCU and migration disabling for RT kernels,
> >>>>> and even then only if preemption is not already disabled.
> >>>>
> >>>> Hi Paul,
> >>>>
> >>>> Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
> >>>
> >>> Because in some cases the non-RT benefit is suspected to be negative
> >>> due to increasing the probability of preemption in awkward places.
> >>
> >> Since you mentioned suspected, I am guessing there is no concrete data collected
> >> to substantiate that specifically for BPF programs, but correct me if I missed
> >> something. Assuming you're referring to latency versus tradeoffs issues, due to
> >> preemption, Android is not PREEMPT_RT but is expected to be low latency in
> >> general as well. So is this decision the right one for Android as well,
> >> considering that (I heard) it uses BPF? Just an open-ended question.
> >>
> >> There is also issue of 2 different paths for PREEMPT_RT versus otherwise,
> >> complicating the tracing side so there better be a reason for that I guess.
> >
> > You are advocating a change in behavior for non-RT workloads. Why do
> > you believe that this change would be OK for those workloads?
>
> Same reasons I provided in my last email. If we are saying SRCU-fast is required for lower latency, I find it strange that we are leaving out Android which has low latency audio usecases, for instance.
If Android provides numbers showing that it helps them, then it is easy
to provide a Kconfig option that defaults to PREEMPT_RT, but that Android
can override. Right?
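As a rough sketch, with a made-up CONFIG_TRACEPOINT_PREEMPTIBLE symbol
standing in for whatever such an option would end up being called, the
header-side selection could key off that option instead of
CONFIG_PREEMPT_RT directly:

#ifdef CONFIG_TRACEPOINT_PREEMPTIBLE	/* hypothetical; default y if PREEMPT_RT */
extern struct srcu_struct tracepoint_srcu;
# define tracepoint_sync()	synchronize_srcu(&tracepoint_srcu)
# define tracepoint_guard()				\
	guard(srcu_fast_notrace)(&tracepoint_srcu);	\
	guard(migrate)()
#else
# define tracepoint_sync()	synchronize_rcu()
# define tracepoint_guard()	guard(preempt_notrace)()
#endif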
Thanx, Paul
> Thanks,
>
> - Joel
>
>
> >
> > Thanx, Paul
> On Dec 12, 2025, at 4:50 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Fri, Dec 12, 2025 at 03:43:07AM +0000, Joel Fernandes wrote:
>>
>>
>>>> On Dec 12, 2025, at 9:47 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
>>>
>>> On Fri, Dec 12, 2025 at 09:12:07AM +0900, Joel Fernandes wrote:
>>>>
>>>>
>>>>> On 12/11/2025 3:23 PM, Paul E. McKenney wrote:
>>>>> On Thu, Dec 11, 2025 at 08:02:15PM +0000, Joel Fernandes wrote:
>>>>>>
>>>>>>
>>>>>>> On Dec 8, 2025, at 1:20 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
>>>>>>>
>>>>>>> The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
>>>>>>> to protect invocation of __DO_TRACE_CALL() means that BPF programs
>>>>>>> attached to tracepoints are non-preemptible. This is unhelpful in
>>>>>>> real-time systems, whose users apparently wish to use BPF while also
>>>>>>> achieving low latencies. (Who knew?)
>>>>>>>
>>>>>>> One option would be to use preemptible RCU, but this introduces
>>>>>>> many opportunities for infinite recursion, which many consider to
>>>>>>> be counterproductive, especially given the relatively small stacks
>>>>>>> provided by the Linux kernel. These opportunities could be shut down
>>>>>>> by sufficiently energetic duplication of code, but this sort of thing
>>>>>>> is considered impolite in some circles.
>>>>>>>
>>>>>>> Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
>>>>>>> readers than those of preemptible RCU, at least on Paul E. McKenney's
>>>>>>> laptop, where task_struct access is more expensive than access to per-CPU
>>>>>>> variables. And SRCU-fast provides way faster readers than does SRCU,
>>>>>>> courtesy of being able to avoid the read-side use of smp_mb(). Also,
>>>>>>> it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
>>>>>>> functions.
>>>>>>>
>>>>>>> While in the area, SRCU now supports early boot call_srcu(). Therefore,
>>>>>>> remove the checks that used to avoid such use from rcu_free_old_probes()
>>>>>>> before this commit was applied:
>>>>>>>
>>>>>>> e53244e2c893 ("tracepoint: Remove SRCU protection")
>>>>>>>
>>>>>>> The current commit can be thought of as an approximate revert of that
>>>>>>> commit, with some compensating additions of preemption disabling.
>>>>>>> This preemption disabling uses guard(preempt_notrace)().
>>>>>>>
>>>>>>> However, Yonghong Song points out that BPF assumes that non-sleepable
>>>>>>> BPF programs will remain on the same CPU, which means that migration
>>>>>>> must be disabled whenever preemption remains enabled. In addition,
>>>>>>> non-RT kernels have performance expectations that would be violated by
>>>>>>> allowing the BPF programs to be preempted.
>>>>>>>
>>>>>>> Therefore, continue to disable preemption in non-RT kernels, and protect
>>>>>>> the BPF program with both SRCU and migration disabling for RT kernels,
>>>>>>> and even then only if preemption is not already disabled.
>>>>>>
>>>>>> Hi Paul,
>>>>>>
>>>>>> Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
>>>>>
>>>>> Because in some cases the non-RT benefit is suspected to be negative
>>>>> due to increasing the probability of preemption in awkward places.
>>>>
>>>> Since you mentioned suspected, I am guessing there is no concrete data collected
>>>> to substantiate that specifically for BPF programs, but correct me if I missed
>>>> something. Assuming you're referring to latency versus tradeoffs issues, due to
>>>> preemption, Android is not PREEMPT_RT but is expected to be low latency in
>>>> general as well. So is this decision the right one for Android as well,
>>>> considering that (I heard) it uses BPF? Just an open-ended question.
>>>>
>>>> There is also issue of 2 different paths for PREEMPT_RT versus otherwise,
>>>> complicating the tracing side so there better be a reason for that I guess.
>>>
>>> You are advocating a change in behavior for non-RT workloads. Why do
>>> you believe that this change would be OK for those workloads?
>>
>> Same reasons I provided in my last email. If we are saying SRCU-fast is required for lower latency, I find it strange that we are leaving out Android which has low latency audio usecases, for instance.
>
> If Android provides numbers showing that it helps them, then it is easy
> to provide a Kconfig option that defaults to PREEMPT_RT, but that Android
> can override. Right?
Sure, but my suspicion is Android or others are not going to look into every PREEMPT_RT specific optimization (not just this one) and see if it benefits their interactivity usecases. They will simply miss out on it without knowing they are.
It might be a good idea (for me) to explore how many such optimizations exist though, that we take for granted. I will look into exploring this on my side. :)
thanks,
- Joel
>
> Thanx, Paul
>
>> Thanks,
>>
>> - Joel
>>
>>
>>>
>>> Thanx, Paul
On Fri, Dec 12, 2025 at 09:28:37AM +0000, Joel Fernandes wrote:
>
>
> > On Dec 12, 2025, at 4:50 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Fri, Dec 12, 2025 at 03:43:07AM +0000, Joel Fernandes wrote:
> >>
> >>
> >>>> On Dec 12, 2025, at 9:47 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >>>
> >>> On Fri, Dec 12, 2025 at 09:12:07AM +0900, Joel Fernandes wrote:
> >>>>
> >>>>
> >>>>> On 12/11/2025 3:23 PM, Paul E. McKenney wrote:
> >>>>> On Thu, Dec 11, 2025 at 08:02:15PM +0000, Joel Fernandes wrote:
> >>>>>>
> >>>>>>
> >>>>>>> On Dec 8, 2025, at 1:20 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >>>>>>>
> >>>>>>> The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
> >>>>>>> to protect invocation of __DO_TRACE_CALL() means that BPF programs
> >>>>>>> attached to tracepoints are non-preemptible. This is unhelpful in
> >>>>>>> real-time systems, whose users apparently wish to use BPF while also
> >>>>>>> achieving low latencies. (Who knew?)
> >>>>>>>
> >>>>>>> One option would be to use preemptible RCU, but this introduces
> >>>>>>> many opportunities for infinite recursion, which many consider to
> >>>>>>> be counterproductive, especially given the relatively small stacks
> >>>>>>> provided by the Linux kernel. These opportunities could be shut down
> >>>>>>> by sufficiently energetic duplication of code, but this sort of thing
> >>>>>>> is considered impolite in some circles.
> >>>>>>>
> >>>>>>> Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
> >>>>>>> readers than those of preemptible RCU, at least on Paul E. McKenney's
> >>>>>>> laptop, where task_struct access is more expensive than access to per-CPU
> >>>>>>> variables. And SRCU-fast provides way faster readers than does SRCU,
> >>>>>>> courtesy of being able to avoid the read-side use of smp_mb(). Also,
> >>>>>>> it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
> >>>>>>> functions.
> >>>>>>>
> >>>>>>> While in the area, SRCU now supports early boot call_srcu(). Therefore,
> >>>>>>> remove the checks that used to avoid such use from rcu_free_old_probes()
> >>>>>>> before this commit was applied:
> >>>>>>>
> >>>>>>> e53244e2c893 ("tracepoint: Remove SRCU protection")
> >>>>>>>
> >>>>>>> The current commit can be thought of as an approximate revert of that
> >>>>>>> commit, with some compensating additions of preemption disabling.
> >>>>>>> This preemption disabling uses guard(preempt_notrace)().
> >>>>>>>
> >>>>>>> However, Yonghong Song points out that BPF assumes that non-sleepable
> >>>>>>> BPF programs will remain on the same CPU, which means that migration
> >>>>>>> must be disabled whenever preemption remains enabled. In addition,
> >>>>>>> non-RT kernels have performance expectations that would be violated by
> >>>>>>> allowing the BPF programs to be preempted.
> >>>>>>>
> >>>>>>> Therefore, continue to disable preemption in non-RT kernels, and protect
> >>>>>>> the BPF program with both SRCU and migration disabling for RT kernels,
> >>>>>>> and even then only if preemption is not already disabled.
> >>>>>>
> >>>>>> Hi Paul,
> >>>>>>
> >>>>>> Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
> >>>>>
> >>>>> Because in some cases the non-RT benefit is suspected to be negative
> >>>>> due to increasing the probability of preemption in awkward places.
> >>>>
> >>>> Since you mentioned suspected, I am guessing there is no concrete data collected
> >>>> to substantiate that specifically for BPF programs, but correct me if I missed
> >>>> something. Assuming you're referring to latency versus tradeoffs issues, due to
> >>>> preemption, Android is not PREEMPT_RT but is expected to be low latency in
> >>>> general as well. So is this decision the right one for Android as well,
> >>>> considering that (I heard) it uses BPF? Just an open-ended question.
> >>>>
> >>>> There is also issue of 2 different paths for PREEMPT_RT versus otherwise,
> >>>> complicating the tracing side so there better be a reason for that I guess.
> >>>
> >>> You are advocating a change in behavior for non-RT workloads. Why do
> >>> you believe that this change would be OK for those workloads?
> >>
> >> Same reasons I provided in my last email. If we are saying SRCU-fast is required for lower latency, I find it strange that we are leaving out Android which has low latency audio usecases, for instance.
> >
> > If Android provides numbers showing that it helps them, then it is easy
> > to provide a Kconfig option that defaults to PREEMPT_RT, but that Android
> > can override. Right?
>
> Sure, but my suspicion is Android or others are not going to look into every PREEMPT_RT specific optimization (not just this one) and see if it benefits their interactivity usecases. They will simply miss out on it without knowing they are.
>
> It might be a good idea (for me) to explore how many such optimizations exist though, that we take for granted. I will look into exploring this on my side. :)
One workload's optimization is another workload's pessimization, in
part because there are a lot of different measures of performance that
different workloads care about.
But as a practical matter, this is Steven's decision.
Though if he does change the behavior on non-RT setups, I would thank
him to remove my name from the commit, or at least record in the commit
log that I object to changing other workloads' behaviors.
Thanx, Paul
> thanks,
>
> - Joel
>
> >
> > Thanx, Paul
> >
> >> Thanks,
> >>
> >> - Joel
> >>
> >>
> >>>
> >>> Thanx, Paul
> On Dec 13, 2025, at 8:10 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Fri, Dec 12, 2025 at 09:28:37AM +0000, Joel Fernandes wrote:
>>
>>
>>>> On Dec 12, 2025, at 4:50 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
>>>
>>> On Fri, Dec 12, 2025 at 03:43:07AM +0000, Joel Fernandes wrote:
>>>>
>>>>
>>>>>> On Dec 12, 2025, at 9:47 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
>>>>>
>>>>> On Fri, Dec 12, 2025 at 09:12:07AM +0900, Joel Fernandes wrote:
>>>>>>
>>>>>>
>>>>>>> On 12/11/2025 3:23 PM, Paul E. McKenney wrote:
>>>>>>> On Thu, Dec 11, 2025 at 08:02:15PM +0000, Joel Fernandes wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>> On Dec 8, 2025, at 1:20 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
>>>>>>>>>
>>>>>>>>> The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
>>>>>>>>> to protect invocation of __DO_TRACE_CALL() means that BPF programs
>>>>>>>>> attached to tracepoints are non-preemptible. This is unhelpful in
>>>>>>>>> real-time systems, whose users apparently wish to use BPF while also
>>>>>>>>> achieving low latencies. (Who knew?)
>>>>>>>>>
>>>>>>>>> One option would be to use preemptible RCU, but this introduces
>>>>>>>>> many opportunities for infinite recursion, which many consider to
>>>>>>>>> be counterproductive, especially given the relatively small stacks
>>>>>>>>> provided by the Linux kernel. These opportunities could be shut down
>>>>>>>>> by sufficiently energetic duplication of code, but this sort of thing
>>>>>>>>> is considered impolite in some circles.
>>>>>>>>>
>>>>>>>>> Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
>>>>>>>>> readers than those of preemptible RCU, at least on Paul E. McKenney's
>>>>>>>>> laptop, where task_struct access is more expensive than access to per-CPU
>>>>>>>>> variables. And SRCU-fast provides way faster readers than does SRCU,
>>>>>>>>> courtesy of being able to avoid the read-side use of smp_mb(). Also,
>>>>>>>>> it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
>>>>>>>>> functions.
>>>>>>>>>
>>>>>>>>> While in the area, SRCU now supports early boot call_srcu(). Therefore,
>>>>>>>>> remove the checks that used to avoid such use from rcu_free_old_probes()
>>>>>>>>> before this commit was applied:
>>>>>>>>>
>>>>>>>>> e53244e2c893 ("tracepoint: Remove SRCU protection")
>>>>>>>>>
>>>>>>>>> The current commit can be thought of as an approximate revert of that
>>>>>>>>> commit, with some compensating additions of preemption disabling.
>>>>>>>>> This preemption disabling uses guard(preempt_notrace)().
>>>>>>>>>
>>>>>>>>> However, Yonghong Song points out that BPF assumes that non-sleepable
>>>>>>>>> BPF programs will remain on the same CPU, which means that migration
>>>>>>>>> must be disabled whenever preemption remains enabled. In addition,
>>>>>>>>> non-RT kernels have performance expectations that would be violated by
>>>>>>>>> allowing the BPF programs to be preempted.
>>>>>>>>>
>>>>>>>>> Therefore, continue to disable preemption in non-RT kernels, and protect
>>>>>>>>> the BPF program with both SRCU and migration disabling for RT kernels,
>>>>>>>>> and even then only if preemption is not already disabled.
>>>>>>>>
>>>>>>>> Hi Paul,
>>>>>>>>
>>>>>>>> Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
>>>>>>>
>>>>>>> Because in some cases the non-RT benefit is suspected to be negative
>>>>>>> due to increasing the probability of preemption in awkward places.
>>>>>>
>>>>>> Since you mentioned suspected, I am guessing there is no concrete data collected
>>>>>> to substantiate that specifically for BPF programs, but correct me if I missed
>>>>>> something. Assuming you're referring to latency versus tradeoffs issues, due to
>>>>>> preemption, Android is not PREEMPT_RT but is expected to be low latency in
>>>>>> general as well. So is this decision the right one for Android as well,
>>>>>> considering that (I heard) it uses BPF? Just an open-ended question.
>>>>>>
>>>>>> There is also issue of 2 different paths for PREEMPT_RT versus otherwise,
>>>>>> complicating the tracing side so there better be a reason for that I guess.
>>>>>
>>>>> You are advocating a change in behavior for non-RT workloads. Why do
>>>>> you believe that this change would be OK for those workloads?
>>>>
>>>> Same reasons I provided in my last email. If we are saying SRCU-fast is required for lower latency, I find it strange that we are leaving out Android which has low latency audio usecases, for instance.
>>>
>>> If Android provides numbers showing that it helps them, then it is easy
>>> to provide a Kconfig option that defaults to PREEMPT_RT, but that Android
>>> can override. Right?
>>
>> Sure, but my suspicion is Android or others are not going to look into every PREEMPT_RT specific optimization (not just this one) and see if it benefits their interactivity usecases. They will simply miss out on it without knowing they are.
>>
>> It might be a good idea (for me) to explore how many such optimizations exist though, that we take for granted. I will look into exploring this on my side. :)
>
> One workload's optimization is another workload's pessimization, in
> part because there are a lot of different measures of performance that
> different workloads care about.
>
> But as a practical matter, this is Steven's decision.
>
> Though if he does change the behavior on non-RT setups, I would thank
> him to remove my name from the commit, or at least record in the commit
> log that I object to changing other workloads' behaviors.
You have a point. I am not saying we should do this for sure but should at least consider / explore it.
Thanks.
>
> Thanx, Paul
>
>> thanks,
>>
>> - Joel
>>
>>>
>>> Thanx, Paul
>>>
>>>> Thanks,
>>>>
>>>> - Joel
>>>>
>>>>
>>>>>
>>>>> Thanx, Paul
On Fri, Dec 12, 2025 at 11:54:28PM +0000, Joel Fernandes wrote:
>
>
> > On Dec 13, 2025, at 8:10 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Fri, Dec 12, 2025 at 09:28:37AM +0000, Joel Fernandes wrote:
> >>
> >>
> >>>> On Dec 12, 2025, at 4:50 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >>>
> >>> On Fri, Dec 12, 2025 at 03:43:07AM +0000, Joel Fernandes wrote:
> >>>>
> >>>>
> >>>>>> On Dec 12, 2025, at 9:47 AM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >>>>>
> >>>>> On Fri, Dec 12, 2025 at 09:12:07AM +0900, Joel Fernandes wrote:
> >>>>>>
> >>>>>>
> >>>>>>> On 12/11/2025 3:23 PM, Paul E. McKenney wrote:
> >>>>>>> On Thu, Dec 11, 2025 at 08:02:15PM +0000, Joel Fernandes wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>>> On Dec 8, 2025, at 1:20 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
> >>>>>>>>>
> >>>>>>>>> The current use of guard(preempt_notrace)() within __DECLARE_TRACE()
> >>>>>>>>> to protect invocation of __DO_TRACE_CALL() means that BPF programs
> >>>>>>>>> attached to tracepoints are non-preemptible. This is unhelpful in
> >>>>>>>>> real-time systems, whose users apparently wish to use BPF while also
> >>>>>>>>> achieving low latencies. (Who knew?)
> >>>>>>>>>
> >>>>>>>>> One option would be to use preemptible RCU, but this introduces
> >>>>>>>>> many opportunities for infinite recursion, which many consider to
> >>>>>>>>> be counterproductive, especially given the relatively small stacks
> >>>>>>>>> provided by the Linux kernel. These opportunities could be shut down
> >>>>>>>>> by sufficiently energetic duplication of code, but this sort of thing
> >>>>>>>>> is considered impolite in some circles.
> >>>>>>>>>
> >>>>>>>>> Therefore, use the shiny new SRCU-fast API, which provides somewhat faster
> >>>>>>>>> readers than those of preemptible RCU, at least on Paul E. McKenney's
> >>>>>>>>> laptop, where task_struct access is more expensive than access to per-CPU
> >>>>>>>>> variables. And SRCU-fast provides way faster readers than does SRCU,
> >>>>>>>>> courtesy of being able to avoid the read-side use of smp_mb(). Also,
> >>>>>>>>> it is quite straightforward to create srcu_read_{,un}lock_fast_notrace()
> >>>>>>>>> functions.
> >>>>>>>>>
> >>>>>>>>> While in the area, SRCU now supports early boot call_srcu(). Therefore,
> >>>>>>>>> remove the checks that used to avoid such use from rcu_free_old_probes()
> >>>>>>>>> before this commit was applied:
> >>>>>>>>>
> >>>>>>>>> e53244e2c893 ("tracepoint: Remove SRCU protection")
> >>>>>>>>>
> >>>>>>>>> The current commit can be thought of as an approximate revert of that
> >>>>>>>>> commit, with some compensating additions of preemption disabling.
> >>>>>>>>> This preemption disabling uses guard(preempt_notrace)().
> >>>>>>>>>
> >>>>>>>>> However, Yonghong Song points out that BPF assumes that non-sleepable
> >>>>>>>>> BPF programs will remain on the same CPU, which means that migration
> >>>>>>>>> must be disabled whenever preemption remains enabled. In addition,
> >>>>>>>>> non-RT kernels have performance expectations that would be violated by
> >>>>>>>>> allowing the BPF programs to be preempted.
> >>>>>>>>>
> >>>>>>>>> Therefore, continue to disable preemption in non-RT kernels, and protect
> >>>>>>>>> the BPF program with both SRCU and migration disabling for RT kernels,
> >>>>>>>>> and even then only if preemption is not already disabled.
> >>>>>>>>
> >>>>>>>> Hi Paul,
> >>>>>>>>
> >>>>>>>> Is there a reason to not make non-RT also benefit from SRCU fast and trace points for BPF? Can be a follow up patch though if needed.
> >>>>>>>
> >>>>>>> Because in some cases the non-RT benefit is suspected to be negative
> >>>>>>> due to increasing the probability of preemption in awkward places.
> >>>>>>
> >>>>>> Since you mentioned suspected, I am guessing there is no concrete data collected
> >>>>>> to substantiate that specifically for BPF programs, but correct me if I missed
> >>>>>> something. Assuming you're referring to latency versus tradeoffs issues, due to
> >>>>>> preemption, Android is not PREEMPT_RT but is expected to be low latency in
> >>>>>> general as well. So is this decision the right one for Android as well,
> >>>>>> considering that (I heard) it uses BPF? Just an open-ended question.
> >>>>>>
> >>>>>> There is also issue of 2 different paths for PREEMPT_RT versus otherwise,
> >>>>>> complicating the tracing side so there better be a reason for that I guess.
> >>>>>
> >>>>> You are advocating a change in behavior for non-RT workloads. Why do
> >>>>> you believe that this change would be OK for those workloads?
> >>>>
> >>>> Same reasons I provided in my last email. If we are saying SRCU-fast is required for lower latency, I find it strange that we are leaving out Android which has low latency audio usecases, for instance.
> >>>
> >>> If Android provides numbers showing that it helps them, then it is easy
> >>> to provide a Kconfig option that defaults to PREEMPT_RT, but that Android
> >>> can override. Right?
> >>
> >> Sure, but my suspicion is Android or others are not going to look into every PREEMPT_RT specific optimization (not just this one) and see if it benefits their interactivity usecases. They will simply miss out on it without knowing they are.
> >>
> >> It might be a good idea (for me) to explore how many such optimizations exist though, that we take for granted. I will look into exploring this on my side. :)
> >
> > One workload's optimization is another workload's pessimization, in
> > part because there are a lot of different measures of performance that
> > different workloads care about.
> >
> > But as a practical matter, this is Steven's decision.
> >
> > Though if he does change the behavior on non-RT setups, I would thank
> > him to remove my name from the commit, or at least record in the commit
> > log that I object to changing other workloads' behaviors.
>
> You have a point. I am not saying we should do this for sure but should at least consider / explore it.
Now *that* I have no problem with, as long as the consideration and
exploration is very public and includes the usual BPF/tracing suspects.
Thanx, Paul
> Thanks.
>
>
>
> >
> > Thanx, Paul
> >
> >> thanks,
> >>
> >> - Joel
> >>
> >>>
> >>> Thanx, Paul
> >>>
> >>>> Thanks,
> >>>>
> >>>> - Joel
> >>>>
> >>>>
> >>>>>
> >>>>> Thanx, Paul
On Fri, 12 Dec 2025 16:06:09 -0800
"Paul E. McKenney" <paulmck@kernel.org> wrote:

> Now *that* I have no problem with, as long as the consideration and
> exploration is very public and includes the usual BPF/tracing suspects.

So we are all set then ;-)

As I talked with both of you, I'll just reinstate my thoughts on the
patch here and make it public.

I agree with Joel that it would be better to have consistency between
RT and non-RT.

I agree with Paul that I do not want to add possible regressions for
the sake of consistency.

Thus, I'm going to keep this a PREEMPT_RT only change. If someone can
come in and convince us that the PREEMPT_RT way is also beneficial for
the non-RT case then we can make it consistent again. Until then, this
change is focusing on fixing PREEMPT_RT, and that's what the patch is
going to be limited to.

Thanks for the discussion,

-- Steve

PS. I have a working patch, but since I've been busy running a
conference, I haven't had the time to vet it enough for public
consumption.
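To make the split Steven describes concrete, below is a minimal sketch of
the protection scheme from the patch: non-RT kernels keep preemption
disabled across the probe as today, while PREEMPT_RT kernels use the
SRCU-fast notrace readers plus migration disabling, and only when
preemption is not already disabled. This is not the actual
__DECLARE_TRACE() code: the helper name, the srcu_struct name
(tracepoint_srcu), and the cookie type are assumptions for illustration;
only guard(preempt_notrace)(), migrate_disable()/migrate_enable(), and
srcu_read_{,un}lock_fast_notrace() come from the patch under discussion.

/*
 * Illustrative sketch only -- not the actual __DECLARE_TRACE() code.
 * Assumes <linux/preempt.h>, <linux/srcu.h>, and an srcu_struct named
 * tracepoint_srcu defined elsewhere (e.g., via DEFINE_SRCU()), and that
 * srcu_read_lock_fast_notrace() returns a per-CPU counter cookie the
 * way srcu_read_lock_fast() does.
 */
static void call_tracepoint_probe_sketch(void (*probe)(void *), void *data)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
                /* Non-RT: preserve today's behavior and expectations. */
                guard(preempt_notrace)();
                probe(data);
                return;
        }

        if (!preemptible()) {
                /* Caller is already non-preemptible: nothing more to add. */
                probe(data);
                return;
        }

        /*
         * PREEMPT_RT with preemption enabled: keep the probe (and any
         * attached non-sleepable BPF program) on this CPU with
         * migrate_disable(), and protect it with an SRCU-fast reader so
         * that it remains preemptible.
         */
        {
                struct srcu_ctr __percpu *cookie;       /* assumed cookie type */

                migrate_disable();
                cookie = srcu_read_lock_fast_notrace(&tracepoint_srcu);
                probe(data);
                srcu_read_unlock_fast_notrace(&tracepoint_srcu, cookie);
                migrate_enable();
        }
}

On non-RT kernels this is behaviorally what happens today, which is why
limiting the change to PREEMPT_RT sidesteps the regression concerns raised
earlier in the thread.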
On 2025-12-12 21:18, Steven Rostedt wrote:
[...]
>
> Thus, I'm going to keep this a PREEMPT_RT only change. If someone can
> come in and convince us that the PREEMPT_RT way is also beneficial for
> the non-RT case then we can make it consistent again. Until then, this
> change is focusing on fixing PREEMPT_RT, and that's what the patch is
> going to be limited to.

Here is one additional thing to keep in mind: although
SRCU-fast is probably quite fast (as the name implies),
last time I tried using migrate disable in a fast path
I was surprised to see the verbosity of the generated assembly,
and how slow it was compared to preempt disable.

So before using migrate disable on a fast path, at least on
non-preempt-RT configs, we should carefully consider the
performance impact of migrate disable.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Fri, 12 Dec 2025 23:19:41 -0500
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> Here is one additional thing to keep in mind: although
> SRCU-fast is probably quite fast (as the name implies),
> last time I tried using migrate disable in a fast path
> I was surprised to see the verbosity of the generated assembly,
> and how slow it was compared to preempt disable.
>
> So before using migrate disable on a fast path, at least on
> non-preempt-RT configs, we should carefully consider the
> performance impact of migrate disable.

Right, which is another reason I'm keeping that part of the patch as-is.

-- Steve
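Mathieu's point about migrate disable overhead is also something that can
be measured directly. Below is a rough, hypothetical sketch of such a check
as a throwaway kernel module; the module name, the loop count, and the idea
of timing empty disable/enable pairs are assumptions for illustration
(nothing like this was posted in the thread), and a tight loop understates
real-world effects such as scheduler work deferred until migrate_enable().
Disassembling the built module would also make the code-generation
difference Mathieu mentions visible.

/*
 * Hypothetical throwaway module: compare the cost of empty
 * preempt_disable()/preempt_enable() pairs against
 * migrate_disable()/migrate_enable() pairs on the local CPU.
 */
#include <linux/module.h>
#include <linux/preempt.h>
#include <linux/ktime.h>
#include <linux/math64.h>

static int __init mdis_bench_init(void)
{
        const u32 loops = 1000000;
        u64 t0, t1, t2;
        u32 i;

        t0 = ktime_get_ns();
        for (i = 0; i < loops; i++) {
                preempt_disable();
                preempt_enable();
        }
        t1 = ktime_get_ns();
        for (i = 0; i < loops; i++) {
                migrate_disable();
                migrate_enable();
        }
        t2 = ktime_get_ns();

        pr_info("preempt_disable/enable: %llu ns/pair\n",
                div_u64(t1 - t0, loops));
        pr_info("migrate_disable/enable: %llu ns/pair\n",
                div_u64(t2 - t1, loops));
        return 0;
}

static void __exit mdis_bench_exit(void)
{
}

module_init(mdis_bench_init);
module_exit(mdis_bench_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("migrate_disable() vs preempt_disable() micro-benchmark sketch");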
On Sun, 7 Dec 2025 20:20:23 -0800
"Paul E. McKenney" <paulmck@kernel.org> wrote:
> [ paulmck: Remove trace_syscalls.h changes per Steven Rostedt. ]
But they still need to be fixed.
With PREEMPT_RT enabled:
# trace-cmd start -e syscalls
# trace-cmd show
bash-1165 [001] DBZ.f 269.955644: sys_ioctl(fd: 0xff, cmd: 0x5413, arg: 0x7fffd3d6a2d0)
bash-1165 [001] DBZ.f 269.955649: sys_ioctl -> 0x0
bash-1165 [001] DBZ.f 269.955694: sys_rt_sigprocmask(how: 2, nset: 0x7fffd3d6a3a0, oset: 0, sigsetsize: 8)
bash-1165 [001] DBZ.f 269.955698: sys_rt_sigprocmask -> 0x0
bash-1165 [001] DBZ.f 269.955715: sys_wait4(upid: 0xffffffffffffffff, stat_addr: 0x7fffd3d69c50, options: 0xb, ru: 0)
bash-1165 [001] DBZ.f 269.955722: sys_wait4 -> 0xfffffffffffffff6
bash-1165 [001] DBZ.f 269.955725: sys_rt_sigreturn()
bash-1165 [001] DBZ.f 269.955758: sys_rt_sigaction(sig: 2, act: 0x7fffd3d6a2e0, oact: 0x7fffd3d6a380, sigsetsize: 8)
bash-1165 [001] DBZ.f 269.955762: sys_rt_sigaction -> 0x0
^^^^^
This is just garbage.
-- Steve
On Mon, Dec 08, 2025 at 04:43:52AM -0500, Steven Rostedt wrote:
> On Sun, 7 Dec 2025 20:20:23 -0800
> "Paul E. McKenney" <paulmck@kernel.org> wrote:
> 
> > [ paulmck: Remove trace_syscalls.h changes per Steven Rostedt. ]
> 
> But they still need to be fixed.
> 
> With PREEMPT_RT enabled:
> 
> # trace-cmd start -e syscalls
> # trace-cmd show
> bash-1165 [001] DBZ.f 269.955644: sys_ioctl(fd: 0xff, cmd: 0x5413, arg: 0x7fffd3d6a2d0)
> bash-1165 [001] DBZ.f 269.955649: sys_ioctl -> 0x0
> bash-1165 [001] DBZ.f 269.955694: sys_rt_sigprocmask(how: 2, nset: 0x7fffd3d6a3a0, oset: 0, sigsetsize: 8)
> bash-1165 [001] DBZ.f 269.955698: sys_rt_sigprocmask -> 0x0
> bash-1165 [001] DBZ.f 269.955715: sys_wait4(upid: 0xffffffffffffffff, stat_addr: 0x7fffd3d69c50, options: 0xb, ru: 0)
> bash-1165 [001] DBZ.f 269.955722: sys_wait4 -> 0xfffffffffffffff6
> bash-1165 [001] DBZ.f 269.955725: sys_rt_sigreturn()
> bash-1165 [001] DBZ.f 269.955758: sys_rt_sigaction(sig: 2, act: 0x7fffd3d6a2e0, oact: 0x7fffd3d6a380, sigsetsize: 8)
> bash-1165 [001] DBZ.f 269.955762: sys_rt_sigaction -> 0x0
> ^^^^^
> This is just garbage.

Yes, but it is the original garbage, and garbage obtained by acceding
to your request. ;-)

Perhaps a little bit more constructively, have your conflicting changes
hit mainline yet?

Thanx, Paul
On Mon, 8 Dec 2025 12:46:32 -0800
"Paul E. McKenney" <paulmck@kernel.org> wrote:
> > bash-1165 [001] DBZ.f 269.955762: sys_rt_sigaction -> 0x0
> > ^^^^^
> > This is just garbage.
>
> Yes, but it is the original garbage, and garbage obtained by acceding
> to your request. ;-)
Yes, here's what it looks like before patch:
bash-1168 [001] ..... 77.498737: sys_wait4 -> 0xfffffffffffffff6
bash-1168 [001] ..... 77.498740: sys_rt_sigreturn()
bash-1168 [001] ..... 77.498765: sys_rt_sigaction(sig: 2, act: 0x7ffef2f08c50, oact: 0x7ffef2f08cf0, sigsetsize: 8)
bash-1168 [001] ..... 77.498770: sys_rt_sigaction -> 0x0
bash-1168 [001] ..... 77.498794: sys_rt_sigprocmask(how: 0, nset: 0, oset: 0x7ffef2f08c70, sigsetsize: 8)
bash-1168 [001] ..... 77.498795: sys_rt_sigprocmask -> 0x0
bash-1168 [001] ..... 77.499055: sys_write(fd: 1, buf: 0x562f0f3aba90 (1b:5d:38:30:30:33:3b:65:6e:64:3d:36:62:62:34:38:38:39:38:2d:32:33:31:33:2d:34:64:38:30:2d:39:30:62:39
:2d:66:33:63:31:36:36:37:62:63:37:37:34:3b:65:78:69:74:3d:73:75:63:63:65:73:73:1b:5c) ".]8003;end=6bb48898-2313-4d80-90b9-f3c1667bc774;exit=success.\", count: 0x3e)
bash-1168 [001] .l... 77.499099: sys_write -> 0x3e
bash-1168 [001] ..... 77.499196: sys_rt_sigprocmask(how: 0, nset: 0, oset: 0x7ffef2f080c0, sigsetsize: 8)
bash-1168 [001] ..... 77.499198: sys_rt_sigprocmask -> 0x0
[ that lone "l" (el not one) is LAZY_NEED_RESCHED ]
After adding the patch:
bash-1212 [005] DBZ.f 72.179808: sys_rt_sigprocmask -> 0x0
bash-1212 [005] DBZ.f 72.179811: sys_ioctl(fd: 0xff, cmd: 0x5401, arg: 0x7ffcaef0e1b0)
bash-1212 [005] DBZ.f 72.179815: sys_ioctl -> 0x0
bash-1212 [005] DBZ.f 72.179818: sys_ioctl(fd: 0xff, cmd: 0x5413, arg: 0x7ffcaef0e1c0)
bash-1212 [005] DBZ.f 72.179823: sys_ioctl -> 0x0
bash-1212 [005] DBZ.f 72.179862: sys_rt_sigprocmask(how: 2, nset: 0x7ffcaef0e290, oset: 0, sigsetsize: 8)
bash-1212 [005] DBZ.f 72.179866: sys_rt_sigprocmask -> 0x0
bash-1212 [005] DBZ.f 72.179884: sys_wait4(upid: 0xffffffffffffffff, stat_addr: 0x7ffcaef0db50, options: 0xb, ru: 0)
bash-1212 [005] DBZ.f 72.179891: sys_wait4 -> 0xfffffffffffffff6
bash-1212 [005] DBZ.f 72.179894: sys_rt_sigreturn()
>
> Perhaps a little bit more constructively, have your conflicting changes
> hit mainline yet?
All my changes have hit mainline.
I believe I did have a solution for fixing the above issue but I was
planning to implement it after the merge window. I can probably do that
today during the keynotes ;-)
-- Steve
On Mon, Dec 08, 2025 at 07:38:49PM -0500, Steven Rostedt wrote:
> On Mon, 8 Dec 2025 12:46:32 -0800
> "Paul E. McKenney" <paulmck@kernel.org> wrote:
> 
> > > bash-1165 [001] DBZ.f 269.955762: sys_rt_sigaction -> 0x0
> > > ^^^^^
> > > This is just garbage.
> > 
> > Yes, but it is the original garbage, and garbage obtained by acceding
> > to your request. ;-)
> 
> Yes, here's what it looks like before patch:
> 
> bash-1168 [001] ..... 77.498737: sys_wait4 -> 0xfffffffffffffff6
> bash-1168 [001] ..... 77.498740: sys_rt_sigreturn()
> bash-1168 [001] ..... 77.498765: sys_rt_sigaction(sig: 2, act: 0x7ffef2f08c50, oact: 0x7ffef2f08cf0, sigsetsize: 8)
> bash-1168 [001] ..... 77.498770: sys_rt_sigaction -> 0x0
> bash-1168 [001] ..... 77.498794: sys_rt_sigprocmask(how: 0, nset: 0, oset: 0x7ffef2f08c70, sigsetsize: 8)
> bash-1168 [001] ..... 77.498795: sys_rt_sigprocmask -> 0x0
> bash-1168 [001] ..... 77.499055: sys_write(fd: 1, buf: 0x562f0f3aba90 (1b:5d:38:30:30:33:3b:65:6e:64:3d:36:62:62:34:38:38:39:38:2d:32:33:31:33:2d:34:64:38:30:2d:39:30:62:39
> :2d:66:33:63:31:36:36:37:62:63:37:37:34:3b:65:78:69:74:3d:73:75:63:63:65:73:73:1b:5c) ".]8003;end=6bb48898-2313-4d80-90b9-f3c1667bc774;exit=success.\", count: 0x3e)
> bash-1168 [001] .l... 77.499099: sys_write -> 0x3e
> bash-1168 [001] ..... 77.499196: sys_rt_sigprocmask(how: 0, nset: 0, oset: 0x7ffef2f080c0, sigsetsize: 8)
> bash-1168 [001] ..... 77.499198: sys_rt_sigprocmask -> 0x0
> 
> [ that lone "l" (el not one) is LAZY_NEED_RESCHED ]
> 
> After adding the patch:
> 
> bash-1212 [005] DBZ.f 72.179808: sys_rt_sigprocmask -> 0x0
> bash-1212 [005] DBZ.f 72.179811: sys_ioctl(fd: 0xff, cmd: 0x5401, arg: 0x7ffcaef0e1b0)
> bash-1212 [005] DBZ.f 72.179815: sys_ioctl -> 0x0
> bash-1212 [005] DBZ.f 72.179818: sys_ioctl(fd: 0xff, cmd: 0x5413, arg: 0x7ffcaef0e1c0)
> bash-1212 [005] DBZ.f 72.179823: sys_ioctl -> 0x0
> bash-1212 [005] DBZ.f 72.179862: sys_rt_sigprocmask(how: 2, nset: 0x7ffcaef0e290, oset: 0, sigsetsize: 8)
> bash-1212 [005] DBZ.f 72.179866: sys_rt_sigprocmask -> 0x0
> bash-1212 [005] DBZ.f 72.179884: sys_wait4(upid: 0xffffffffffffffff, stat_addr: 0x7ffcaef0db50, options: 0xb, ru: 0)
> bash-1212 [005] DBZ.f 72.179891: sys_wait4 -> 0xfffffffffffffff6
> bash-1212 [005] DBZ.f 72.179894: sys_rt_sigreturn()

Understood.

> > Perhaps a little bit more constructively, have your conflicting changes
> > hit mainline yet?
> 
> All my changes have hit mainline.
> 
> I believe I did have a solution for fixing the above issue but I was
> planning to implement it after the merge window. I can probably do that
> today during the keynotes ;-)

Would it be easiest for me to just hand the patch back to you? I am of
course happy to push it myself, but I am also happy to avoid being in
the way.

Thanx, Paul
On Tue, 9 Dec 2025 14:29:00 -0800
"Paul E. McKenney" <paulmck@kernel.org> wrote:

> Would it be easiest for me to just hand the patch back to you? I am of
> course happy to push it myself, but I am also happy to avoid being in
> the way.

Yeah, that may be the best.

Thanks,

-- Steve