include/linux/tracepoint.h | 9 +++++++++ kernel/bpf/syscall.c | 23 +++++++++++++++-------- kernel/tracepoint.c | 22 ++++++++++++++++++---- 3 files changed, 42 insertions(+), 12 deletions(-)
The grace period used internally within tracepoint.c:release_probes()
uses call_rcu() to batch waiting for quiescence of old probe arrays,
rather than using the tracepoint_synchronize_unregister() which blocks
while waiting for quiescence.
With the introduction of faultable syscall tracepoints, this causes
use-after-free issues reproduced with syzkaller.
Fix this by introducing tracepoint_call_rcu(), which uses the
appropriate call_rcu() or call_rcu_tasks_trace() before invoking the
rcu_free_old_probes callback.
Use tracepoint_call_rcu() in bpf_link_free() for raw tracepoints as
well, which has the same problem for syscall tracepoints.
Reported-by: syzbot+b390c8062d8387b6272a@syzkaller.appspotmail.com
Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults")
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
---
Changes since v0:
- Introduce tracepoint_call_rcu(),
- Fix bpf_link_free() use of call_rcu as well.
---
include/linux/tracepoint.h | 9 +++++++++
kernel/bpf/syscall.c | 23 +++++++++++++++--------
kernel/tracepoint.c | 22 ++++++++++++++++++----
3 files changed, 42 insertions(+), 12 deletions(-)
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 0dc67fad706c..45025d6b2dd6 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -104,6 +104,8 @@ void for_each_tracepoint_in_module(struct module *mod,
* tracepoint_synchronize_unregister must be called between the last tracepoint
* probe unregistration and the end of module exit to make sure there is no
* caller executing a probe when it is freed.
+ * An alternative to tracepoint_synchronize_unregister() is to use
+ * tracepoint_call_rcu() for batched reclaim.
*/
#ifdef CONFIG_TRACEPOINTS
static inline void tracepoint_synchronize_unregister(void)
@@ -111,9 +113,16 @@ static inline void tracepoint_synchronize_unregister(void)
synchronize_rcu_tasks_trace();
synchronize_rcu();
}
+
+void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
+ void (*callback)(struct rcu_head *head));
+
#else
static inline void tracepoint_synchronize_unregister(void)
{ }
+static inline void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
+ void (*callback)(struct rcu_head *head))
+{ }
#endif
#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 59de664e580d..1191dc1d4206 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3006,14 +3006,21 @@ static void bpf_link_free(struct bpf_link *link)
bpf_prog_put(link->prog);
}
if (ops->dealloc_deferred) {
- /* schedule BPF link deallocation; if underlying BPF program
- * is sleepable, we need to first wait for RCU tasks trace
- * sync, then go through "classic" RCU grace period
- */
- if (sleepable)
- call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
- else
- call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ if (link->type == BPF_LINK_TYPE_RAW_TRACEPOINT) {
+ struct bpf_raw_tp_link *raw_tp =
+ container_of(link, struct bpf_raw_tp_link, link);
+
+ tracepoint_call_rcu(raw_tp->btp->tp, &link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ } else {
+ /* schedule BPF link deallocation; if underlying BPF program
+ * is sleepable, we need to first wait for RCU tasks trace
+ * sync, then go through "classic" RCU grace period
+ */
+ if (sleepable)
+ call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ else
+ call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ }
} else if (ops->dealloc)
ops->dealloc(link);
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 6474e2cf22c9..ef60c5484eda 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -106,13 +106,27 @@ static void rcu_free_old_probes(struct rcu_head *head)
kfree(container_of(head, struct tp_probes, rcu));
}
-static inline void release_probes(struct tracepoint_func *old)
+static bool tracepoint_is_syscall(struct tracepoint *tp)
+{
+ return !strcmp(tp->name, "sys_enter") || !strcmp(tp->name, "sys_exit");
+}
+
+void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
+ void (*callback)(struct rcu_head *head))
+{
+ if (tracepoint_is_syscall(tp))
+ call_rcu_tasks_trace(head, callback);
+ else
+ call_rcu(head, callback);
+}
+
+static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old)
{
if (old) {
struct tp_probes *tp_probes = container_of(old,
struct tp_probes, probes[0]);
- call_rcu(&tp_probes->rcu, rcu_free_old_probes);
+ tracepoint_call_rcu(tp, &tp_probes->rcu, rcu_free_old_probes);
}
}
@@ -334,7 +348,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
break;
}
- release_probes(old);
+ release_probes(tp, old);
return 0;
}
@@ -406,7 +420,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
WARN_ON_ONCE(1);
break;
}
- release_probes(old);
+ release_probes(tp, old);
return 0;
}
--
2.39.5
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 59de664e580d..1191dc1d4206 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3006,14 +3006,21 @@ static void bpf_link_free(struct bpf_link *link)
> bpf_prog_put(link->prog);

I think we would need the same treatment with bpf_prog_put here.
Something like,

tracepoint_call_rcu(raw_tp->btp->tp, &link->prog->aux->rcu,
		    bpf_link_defer_bpf_prog_put);

static void bpf_link_defer_bpf_prog_put(struct rcu_head *rcu)
{
	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
	bpf_prog_put(aux->prog);
}

Alternatively, some context would need to be passed down to
__bpf_prog_put_noref via the call to bpf_prog_put so it can choose
whether or not to use call_rcu or call_rcu_tasks_trace.

> -static inline void release_probes(struct tracepoint_func *old)
> +static bool tracepoint_is_syscall(struct tracepoint *tp)
> +{
> + return !strcmp(tp->name, "sys_enter") || !strcmp(tp->name, "sys_exit");
> +}

I'm curious if it might be better to add some field to struct
tracepoint like "sleepable" rather than adding a special case here
based on the name? Of course, if it's only ever going to be these
two cases then maybe adding a new field doesn't make sense.

-Jordan
On 2024-10-25 15:08, Jordan Rife wrote:
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 59de664e580d..1191dc1d4206 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -3006,14 +3006,21 @@ static void bpf_link_free(struct bpf_link *link)
>> bpf_prog_put(link->prog);
>
> I think we would need the same treatment with bpf_prog_put here.
> Something like,
>
> tracepoint_call_rcu(raw_tp->btp->tp, &link->prog->aux->rcu,
> 		    bpf_link_defer_bpf_prog_put);
>
> static void bpf_link_defer_bpf_prog_put(struct rcu_head *rcu)
> {
> 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
> 	bpf_prog_put(aux->prog);
> }

Sure, I'll add this in a v2.

>
> Alternatively, some context would need to be passed down to
> __bpf_prog_put_noref via the call to bpf_prog_put so it can choose
> whether or not to use call_rcu or call_rcu_tasks_trace.

Also possible, but more cumbersome.

>
>> -static inline void release_probes(struct tracepoint_func *old)
>> +static bool tracepoint_is_syscall(struct tracepoint *tp)
>> +{
>> + return !strcmp(tp->name, "sys_enter") || !strcmp(tp->name, "sys_exit");
>> +}
>
> I'm curious if it might be better to add some field to struct
> tracepoint like "sleepable" rather than adding a special case here
> based on the name? Of course, if it's only ever going to be these
> two cases then maybe adding a new field doesn't make sense.

I know Steven is reluctant to bloat the tracepoint struct because there
are lots of tracepoint instances (thousands). So for now I thought that
just comparing the name would be a good start.

We can eventually go a different route as well: introduce a section just
to put the syscall tracepoints, and compare the struct tracepoint
pointers to the section begin/end range. But it's rather complex
for what should remain a simple fix.

Thanks,

Mathieu

>
> -Jordan

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On Fri, 25 Oct 2024 15:38:48 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> > I'm curious if it might be better to add some field to struct
> > tracepoint like "sleepable" rather than adding a special case here
> > based on the name? Of course, if it's only ever going to be these
> > two cases then maybe adding a new field doesn't make sense.
>
> I know Steven is reluctant to bloat the tracepoint struct because there
> are lots of tracepoint instances (thousands). So for now I thought that
> just comparing the name would be a good start.

You are correct. I'm really trying to keep the footprint of
tracepoints/events down.

>
> We can eventually go a different route as well: introduce a section just
> to put the syscall tracepoints, and compare the struct tracepoint
> pointers to the section begin/end range. But it's rather complex
> for what should remain a simple fix.

A separate section could work.

-- Steve
On 2024-10-26 03:13, Steven Rostedt wrote:
> On Fri, 25 Oct 2024 15:38:48 -0400
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
>
>>> I'm curious if it might be better to add some field to struct
>>> tracepoint like "sleepable" rather than adding a special case here
>>> based on the name? Of course, if it's only ever going to be these
>>> two cases then maybe adding a new field doesn't make sense.
>>
>> I know Steven is reluctant to bloat the tracepoint struct because there
>> are lots of tracepoint instances (thousands). So for now I thought that
>> just comparing the name would be a good start.
>
> You are correct. I'm really trying to keep the footprint of
> tracepoints/events down.
>
>>
>> We can eventually go a different route as well: introduce a section just
>> to put the syscall tracepoints, and compare the struct tracepoint
>> pointers to the section begin/end range. But it's rather complex
>> for what should remain a simple fix.
>
> A separate section could work.

I have another approach to suggest: it shrinks the
size of struct tracepoint from 80 bytes down to 72 bytes
on x86-64, we don't have to do any section/linker
script trickery, and it's extensible for future flags:

struct static_key {
	int enabled;
	void *p;
};

struct static_key_false {
	struct static_key key;
};

struct static_call_key {
	void *func;
	void *p;
};

struct tracepoint {
	const char *name; /* Tracepoint name */
	struct static_key_false key;
	struct static_call_key *static_call_key;
	void *static_call_tramp;
	void *iterator;
	void *probestub;
	void *funcs;
	/* Flags. */
	unsigned int regfunc:1,
		     syscall:1;
};

struct tracepoint_regfunc {
	struct tracepoint tp;
	int (*regfunc)(void);
	void (*unregfunc)(void);
};

Basically, a tracepoint with regfunc would define a
struct tracepoint_regfunc rather than a struct tracepoint.
So we remove both regfunc and unregfunc NULL pointers in
the common case, which gives us plenty of room for flags.

When we want to access the regfunc/unregfunc from
a struct tracepoint, we check the regfunc flag, and
if set, we can use container_of() to get the struct
tracepoint_regfunc.

Thoughts?

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
On 2024-10-26 10:25, Mathieu Desnoyers wrote:
> On 2024-10-26 03:13, Steven Rostedt wrote:
>> On Fri, 25 Oct 2024 15:38:48 -0400
>> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
>>
>>>> I'm curious if it might be better to add some field to struct
>>>> tracepoint like "sleepable" rather than adding a special case here
>>>> based on the name? Of course, if it's only ever going to be these
>>>> two cases then maybe adding a new field doesn't make sense.
>>>
>>> I know Steven is reluctant to bloat the tracepoint struct because there
>>> are lots of tracepoint instances (thousands). So for now I thought that
>>> just comparing the name would be a good start.
>>
>> You are correct. I'm really trying to keep the footprint of
>> tracepoints/events down.
>>
>>>
>>> We can eventually go a different route as well: introduce a section just
>>> to put the syscall tracepoints, and compare the struct tracepoint
>>> pointers to the section begin/end range. But it's rather complex
>>> for what should remain a simple fix.
>>
>> A separate section could work.
>
> I have another approach to suggest: it shrinks the
> size of struct tracepoint from 80 bytes down to 72 bytes
> on x86-64, we don't have to do any section/linker
> script trickery, and it's extensible for future flags:
>
> struct static_key {
> 	int enabled;
> 	void *p;
> };
>
> struct static_key_false {
> 	struct static_key key;
> };
>
> struct static_call_key {
> 	void *func;
> 	void *p;
> };
>
> struct tracepoint {
> 	const char *name; /* Tracepoint name */
> 	struct static_key_false key;
> 	struct static_call_key *static_call_key;
> 	void *static_call_tramp;
> 	void *iterator;
> 	void *probestub;
> 	void *funcs;
> 	/* Flags. */
> 	unsigned int regfunc:1,
> 		     syscall:1;
> };
>
> struct tracepoint_regfunc {
> 	struct tracepoint tp;
> 	int (*regfunc)(void);
> 	void (*unregfunc)(void);
> };
>
> Basically, a tracepoint with regfunc would define a
> struct tracepoint_regfunc rather than a struct tracepoint.
> So we remove both regfunc and unregfunc NULL pointers in
> the common case, which gives us plenty of room for flags.
>
> When we want to access the regfunc/unregfunc from
> a struct tracepoint, we check the regfunc flag, and
> if set, we can use container_of() to get the struct
> tracepoint_regfunc.

Actually I can achieve the same space saving with fewer changes
like this:

struct tracepoint_ext {
	void *regfunc;
	void *unregfunc;
	/* Flags. */
	unsigned int syscall:1;
};

struct tracepoint {
	const char *name; /* Tracepoint name */
	struct static_key_false key;
	struct static_call_key *static_call_key;
	void *static_call_tramp;
	void *iterator;
	void *probestub;
	void *funcs;
	struct tracepoint_ext *ext;
};

Thanks,

Mathieu

>
> Thoughts?
>
> Thanks,
>
> Mathieu
>

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
© 2016 - 2024 Red Hat, Inc.