Add a might_fault() check to validate that the perf sys_enter/sys_exit
probe callbacks are indeed called from a context where page faults can
be handled.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
---
include/trace/perf.h | 1 +
kernel/trace/trace_syscalls.c | 2 ++
2 files changed, 3 insertions(+)
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 5650c1bad088..321bfd7919f6 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -84,6 +84,7 @@ perf_trace_##call(void *__data, proto) \
u64 __count __attribute__((unused)); \
struct task_struct *__task __attribute__((unused)); \
\
+ might_fault(); \
guard(preempt_notrace)(); \
do_perf_trace_##call(__data, args); \
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 89d7e4c57b5b..0d42d6f293d6 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -602,6 +602,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
+ might_fault();
guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
@@ -710,6 +711,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
+ might_fault();
guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
--
2.39.2
On Thu, 3 Oct 2024 11:16:37 -0400 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote: > Add a might_fault() check to validate that the perf sys_enter/sys_exit > probe callbacks are indeed called from a context where page faults can > be handled. > > Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> > Cc: Michael Jeanson <mjeanson@efficios.com> > Cc: Steven Rostedt <rostedt@goodmis.org> > Cc: Masami Hiramatsu <mhiramat@kernel.org> > Cc: Peter Zijlstra <peterz@infradead.org> > Cc: Alexei Starovoitov <ast@kernel.org> > Cc: Yonghong Song <yhs@fb.com> > Cc: Paul E. McKenney <paulmck@kernel.org> > Cc: Ingo Molnar <mingo@redhat.com> > Cc: Arnaldo Carvalho de Melo <acme@kernel.org> > Cc: Mark Rutland <mark.rutland@arm.com> > Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> > Cc: Namhyung Kim <namhyung@kernel.org> > Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com> > Cc: bpf@vger.kernel.org > Cc: Joel Fernandes <joel@joelfernandes.org> > --- > include/trace/perf.h | 1 + > kernel/trace/trace_syscalls.c | 2 ++ > 2 files changed, 3 insertions(+) > > diff --git a/include/trace/perf.h b/include/trace/perf.h > index 5650c1bad088..321bfd7919f6 100644 > --- a/include/trace/perf.h > +++ b/include/trace/perf.h > @@ -84,6 +84,7 @@ perf_trace_##call(void *__data, proto) \ > u64 __count __attribute__((unused)); \ > struct task_struct *__task __attribute__((unused)); \ > \ > + might_fault(); \ > guard(preempt_notrace)(); \ > do_perf_trace_##call(__data, args); \ Same for this. This is used for all tracepoints that perf hooks to. -- Steve > } > diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c > index 89d7e4c57b5b..0d42d6f293d6 100644 > --- a/kernel/trace/trace_syscalls.c > +++ b/kernel/trace/trace_syscalls.c > @@ -602,6 +602,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) > * Syscall probe called with preemption enabled, but the ring > * buffer and per-cpu data require preemption to be disabled. > */ > + might_fault(); > guard(preempt_notrace)(); > > syscall_nr = trace_get_syscall_nr(current, regs); > @@ -710,6 +711,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) > * Syscall probe called with preemption enabled, but the ring > * buffer and per-cpu data require preemption to be disabled. > */ > + might_fault(); > guard(preempt_notrace)(); > > syscall_nr = trace_get_syscall_nr(current, regs);
On 2024-10-04 00:37, Steven Rostedt wrote:
> On Thu, 3 Oct 2024 11:16:37 -0400
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
>
>> Add a might_fault() check to validate that the perf sys_enter/sys_exit
>> probe callbacks are indeed called from a context where page faults can
>> be handled.
>>
>> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
>> Cc: Michael Jeanson <mjeanson@efficios.com>
>> Cc: Steven Rostedt <rostedt@goodmis.org>
>> Cc: Masami Hiramatsu <mhiramat@kernel.org>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Alexei Starovoitov <ast@kernel.org>
>> Cc: Yonghong Song <yhs@fb.com>
>> Cc: Paul E. McKenney <paulmck@kernel.org>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
>> Cc: Mark Rutland <mark.rutland@arm.com>
>> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
>> Cc: Namhyung Kim <namhyung@kernel.org>
>> Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
>> Cc: bpf@vger.kernel.org
>> Cc: Joel Fernandes <joel@joelfernandes.org>
>> ---
>> include/trace/perf.h | 1 +
>> kernel/trace/trace_syscalls.c | 2 ++
>> 2 files changed, 3 insertions(+)
>>
>> diff --git a/include/trace/perf.h b/include/trace/perf.h
>> index 5650c1bad088..321bfd7919f6 100644
>> --- a/include/trace/perf.h
>> +++ b/include/trace/perf.h
>> @@ -84,6 +84,7 @@ perf_trace_##call(void *__data, proto) \
>> u64 __count __attribute__((unused)); \
>> struct task_struct *__task __attribute__((unused)); \
>> \
>> + might_fault(); \
>> guard(preempt_notrace)(); \
>> do_perf_trace_##call(__data, args); \
>
> Same for this. This is used for all tracepoints that perf hooks to.
You're also missing the context:
#undef DECLARE_EVENT_SYSCALL_CLASS
#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \
__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
PARAMS(assign), PARAMS(print)) \
static notrace void \
perf_trace_##call(void *__data, proto) \
{ \
u64 __count __attribute__((unused)); \
struct task_struct *__task __attribute__((unused)); \
\
might_fault(); \
guard(preempt_notrace)(); \
do_perf_trace_##call(__data, args); \
}
Not an issue.
Thanks,
Mathieu
>
> -- Steve
>
>> }
>> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
>> index 89d7e4c57b5b..0d42d6f293d6 100644
>> --- a/kernel/trace/trace_syscalls.c
>> +++ b/kernel/trace/trace_syscalls.c
>> @@ -602,6 +602,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>> * Syscall probe called with preemption enabled, but the ring
>> * buffer and per-cpu data require preemption to be disabled.
>> */
>> + might_fault();
>> guard(preempt_notrace)();
>>
>> syscall_nr = trace_get_syscall_nr(current, regs);
>> @@ -710,6 +711,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>> * Syscall probe called with preemption enabled, but the ring
>> * buffer and per-cpu data require preemption to be disabled.
>> */
>> + might_fault();
>> guard(preempt_notrace)();
>>
>> syscall_nr = trace_get_syscall_nr(current, regs);
>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
© 2016 - 2026 Red Hat, Inc.