[RFC PATCH v2] tracing: Fix syscall tracepoint use-after-free

Mathieu Desnoyers posted 1 patch 1 month ago
include/linux/tracepoint.h |  9 +++++++++
kernel/bpf/syscall.c       | 36 +++++++++++++++++++++++++++---------
kernel/tracepoint.c        | 22 ++++++++++++++++++----
3 files changed, 54 insertions(+), 13 deletions(-)
[RFC PATCH v2] tracing: Fix syscall tracepoint use-after-free
Posted by Mathieu Desnoyers 1 month ago
The grace period used internally within tracepoint.c:release_probes()
uses call_rcu() to batch waiting for quiescence of old probe arrays,
rather than using the tracepoint_synchronize_unregister() which blocks
while waiting for quiescence.

With the introduction of faultable syscall tracepoints, this causes
use-after-free issues reproduced with syzkaller.

Fix this by introducing tracepoint_call_rcu(), which uses the
appropriate call_rcu() or call_rcu_tasks_trace() before invoking the
rcu_free_old_probes callback.

Use tracepoint_call_rcu() in bpf_link_free() for raw tracepoints as
well, which has the same problem for syscall tracepoints. Ditto for
bpf_prog_put().

Reported-by: syzbot+b390c8062d8387b6272a@syzkaller.appspotmail.com
Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults")
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jordan Rife <jrife@google.com>
---
Changes since v0:
- Introduce tracepoint_call_rcu(),
- Fix bpf_link_free() use of call_rcu as well.

Changes since v1:
- Use tracepoint_call_rcu() for bpf_prog_put as well.
---
 include/linux/tracepoint.h |  9 +++++++++
 kernel/bpf/syscall.c       | 36 +++++++++++++++++++++++++++---------
 kernel/tracepoint.c        | 22 ++++++++++++++++++----
 3 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 0dc67fad706c..45025d6b2dd6 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -104,6 +104,8 @@ void for_each_tracepoint_in_module(struct module *mod,
  * tracepoint_synchronize_unregister must be called between the last tracepoint
  * probe unregistration and the end of module exit to make sure there is no
  * caller executing a probe when it is freed.
+ * An alternative to tracepoint_synchronize_unregister() is to use
+ * tracepoint_call_rcu() for batched reclaim.
  */
 #ifdef CONFIG_TRACEPOINTS
 static inline void tracepoint_synchronize_unregister(void)
@@ -111,9 +113,16 @@ static inline void tracepoint_synchronize_unregister(void)
 	synchronize_rcu_tasks_trace();
 	synchronize_rcu();
 }
+
+void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
+			 void (*callback)(struct rcu_head *head));
+
 #else
 static inline void tracepoint_synchronize_unregister(void)
 { }
+static inline void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
+				       void (*callback)(struct rcu_head *head))
+{ }
 #endif
 
 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 59de664e580d..f21000f33a61 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2992,28 +2992,46 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
 		call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
 }
 
+static void bpf_link_defer_bpf_prog_put(struct rcu_head *rcu)
+{
+	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+
+	bpf_prog_put(aux->prog);
+}
+
 /* bpf_link_free is guaranteed to be called from process context */
 static void bpf_link_free(struct bpf_link *link)
 {
 	const struct bpf_link_ops *ops = link->ops;
+	struct bpf_raw_tp_link *raw_tp = NULL;
 	bool sleepable = false;
 
+	if (link->type == BPF_LINK_TYPE_RAW_TRACEPOINT)
+		raw_tp = container_of(link, struct bpf_raw_tp_link, link);
 	bpf_link_free_id(link->id);
 	if (link->prog) {
 		sleepable = link->prog->sleepable;
 		/* detach BPF program, clean up used resources */
 		ops->release(link);
-		bpf_prog_put(link->prog);
+		if (raw_tp)
+			tracepoint_call_rcu(raw_tp->btp->tp, &link->prog->aux->rcu,
+					    bpf_link_defer_bpf_prog_put);
+		else
+			bpf_prog_put(link->prog);
 	}
 	if (ops->dealloc_deferred) {
-		/* schedule BPF link deallocation; if underlying BPF program
-		 * is sleepable, we need to first wait for RCU tasks trace
-		 * sync, then go through "classic" RCU grace period
-		 */
-		if (sleepable)
-			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
-		else
-			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+		if (raw_tp) {
+			tracepoint_call_rcu(raw_tp->btp->tp, &link->rcu, bpf_link_defer_dealloc_rcu_gp);
+		} else {
+			/* schedule BPF link deallocation; if underlying BPF program
+			 * is sleepable, we need to first wait for RCU tasks trace
+			 * sync, then go through "classic" RCU grace period
+			 */
+			if (sleepable)
+				call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+			else
+				call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+		}
 	} else if (ops->dealloc)
 		ops->dealloc(link);
 }
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 6474e2cf22c9..ef60c5484eda 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -106,13 +106,27 @@ static void rcu_free_old_probes(struct rcu_head *head)
 	kfree(container_of(head, struct tp_probes, rcu));
 }
 
-static inline void release_probes(struct tracepoint_func *old)
+static bool tracepoint_is_syscall(struct tracepoint *tp)
+{
+	return !strcmp(tp->name, "sys_enter") || !strcmp(tp->name, "sys_exit");
+}
+
+void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
+			 void (*callback)(struct rcu_head *head))
+{
+	if (tracepoint_is_syscall(tp))
+		call_rcu_tasks_trace(head, callback);
+	else
+		call_rcu(head, callback);
+}
+
+static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old)
 {
 	if (old) {
 		struct tp_probes *tp_probes = container_of(old,
 			struct tp_probes, probes[0]);
 
-		call_rcu(&tp_probes->rcu, rcu_free_old_probes);
+		tracepoint_call_rcu(tp, &tp_probes->rcu, rcu_free_old_probes);
 	}
 }
 
@@ -334,7 +348,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
 		break;
 	}
 
-	release_probes(old);
+	release_probes(tp, old);
 	return 0;
 }
 
@@ -406,7 +420,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 		WARN_ON_ONCE(1);
 		break;
 	}
-	release_probes(old);
+	release_probes(tp, old);
 	return 0;
 }
 
-- 
2.39.5
Re: [RFC PATCH v2] tracing: Fix syscall tracepoint use-after-free
Posted by Andrii Nakryiko 1 month ago
On Fri, Oct 25, 2024 at 12:41 PM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> The grace period used internally within tracepoint.c:release_probes()
> uses call_rcu() to batch waiting for quiescence of old probe arrays,
> rather than using the tracepoint_synchronize_unregister() which blocks
> while waiting for quiescence.
>
> With the introduction of faultable syscall tracepoints, this causes
> use-after-free issues reproduced with syzkaller.
>
> Fix this by introducing tracepoint_call_rcu(), which uses the
> appropriate call_rcu() or call_rcu_tasks_trace() before invoking the
> rcu_free_old_probes callback.
>
> Use tracepoint_call_rcu() in bpf_link_free() for raw tracepoints as
> well, which has the same problem for syscall tracepoints. Ditto for
> bpf_prog_put().
>
> Reported-by: syzbot+b390c8062d8387b6272a@syzkaller.appspotmail.com
> Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults")
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Michael Jeanson <mjeanson@efficios.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Masami Hiramatsu <mhiramat@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Yonghong Song <yhs@fb.com>
> Cc: Paul E. McKenney <paulmck@kernel.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> Cc: Namhyung Kim <namhyung@kernel.org>
> Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
> Cc: bpf@vger.kernel.org
> Cc: Joel Fernandes <joel@joelfernandes.org>
> Cc: Jordan Rife <jrife@google.com>
> ---
> Changes since v0:
> - Introduce tracepoint_call_rcu(),
> - Fix bpf_link_free() use of call_rcu as well.
>
> Changes since v1:
> - Use tracepoint_call_rcu() for bpf_prog_put as well.
> ---
>  include/linux/tracepoint.h |  9 +++++++++
>  kernel/bpf/syscall.c       | 36 +++++++++++++++++++++++++++---------
>  kernel/tracepoint.c        | 22 ++++++++++++++++++----
>  3 files changed, 54 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
> index 0dc67fad706c..45025d6b2dd6 100644
> --- a/include/linux/tracepoint.h
> +++ b/include/linux/tracepoint.h
> @@ -104,6 +104,8 @@ void for_each_tracepoint_in_module(struct module *mod,
>   * tracepoint_synchronize_unregister must be called between the last tracepoint
>   * probe unregistration and the end of module exit to make sure there is no
>   * caller executing a probe when it is freed.
> + * An alternative to tracepoint_synchronize_unregister() is to use
> + * tracepoint_call_rcu() for batched reclaim.
>   */
>  #ifdef CONFIG_TRACEPOINTS
>  static inline void tracepoint_synchronize_unregister(void)
> @@ -111,9 +113,16 @@ static inline void tracepoint_synchronize_unregister(void)
>         synchronize_rcu_tasks_trace();
>         synchronize_rcu();
>  }
> +
> +void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
> +                        void (*callback)(struct rcu_head *head));
> +
>  #else
>  static inline void tracepoint_synchronize_unregister(void)
>  { }
> +static inline void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
> +                                      void (*callback)(struct rcu_head *head))
> +{ }
>  #endif
>
>  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 59de664e580d..f21000f33a61 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2992,28 +2992,46 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
>                 call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
>  }
>
> +static void bpf_link_defer_bpf_prog_put(struct rcu_head *rcu)
> +{
> +       struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
> +
> +       bpf_prog_put(aux->prog);
> +}
> +
>  /* bpf_link_free is guaranteed to be called from process context */
>  static void bpf_link_free(struct bpf_link *link)
>  {
>         const struct bpf_link_ops *ops = link->ops;
> +       struct bpf_raw_tp_link *raw_tp = NULL;
>         bool sleepable = false;
>
> +       if (link->type == BPF_LINK_TYPE_RAW_TRACEPOINT)
> +               raw_tp = container_of(link, struct bpf_raw_tp_link, link);
>         bpf_link_free_id(link->id);
>         if (link->prog) {
>                 sleepable = link->prog->sleepable;
>                 /* detach BPF program, clean up used resources */
>                 ops->release(link);
> -               bpf_prog_put(link->prog);
> +               if (raw_tp)
> +                       tracepoint_call_rcu(raw_tp->btp->tp, &link->prog->aux->rcu,
> +                                           bpf_link_defer_bpf_prog_put);
> +               else
> +                       bpf_prog_put(link->prog);

it seems like it's problematic to bpf_prog_put() here, probably best
to do it after the link itself goes through RCU grace period. I can
adjust that as well and it will just work.

>         }
>         if (ops->dealloc_deferred) {
> -               /* schedule BPF link deallocation; if underlying BPF program
> -                * is sleepable, we need to first wait for RCU tasks trace
> -                * sync, then go through "classic" RCU grace period
> -                */
> -               if (sleepable)
> -                       call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
> -               else
> -                       call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
> +               if (raw_tp) {
> +                       tracepoint_call_rcu(raw_tp->btp->tp, &link->rcu, bpf_link_defer_dealloc_rcu_gp);

I don't like this. See below, I don't think we should hide sleepable
tracepoint distinction, but also I don't think generic bpf_link_free()
needs to know anything about tracepoint case. Too much abstraction
leaking.

I think the way to go is to generalize sleepable BPF link support in
general and not derive it just from prog->aux->sleepable, but rather
from whether the attachment point is sleepable.

Let me work on this next week at least on BPF side, it's going to be a
bit too much back and forth if you are doing this and trying to guess
what makes sense on BPF side. I'll just need a simple way to detect if
tracepoint target is sleepable (faultable) or not, and the rest we can
handle here, I think.

> +               } else {
> +                       /* schedule BPF link deallocation; if underlying BPF program
> +                        * is sleepable, we need to first wait for RCU tasks trace
> +                        * sync, then go through "classic" RCU grace period
> +                        */
> +                       if (sleepable)
> +                               call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
> +                       else
> +                               call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
> +               }
>         } else if (ops->dealloc)
>                 ops->dealloc(link);
>  }
> diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
> index 6474e2cf22c9..ef60c5484eda 100644
> --- a/kernel/tracepoint.c
> +++ b/kernel/tracepoint.c
> @@ -106,13 +106,27 @@ static void rcu_free_old_probes(struct rcu_head *head)
>         kfree(container_of(head, struct tp_probes, rcu));
>  }
>
> -static inline void release_probes(struct tracepoint_func *old)
> +static bool tracepoint_is_syscall(struct tracepoint *tp)
> +{
> +       return !strcmp(tp->name, "sys_enter") || !strcmp(tp->name, "sys_exit");

Is this really how we know that the tracepoint is sleepable? Based on
its name? Isn't there some flag or something? This is the part I'd
need help with, but hopefully it's not string comparison based.

> +}
> +
> +void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
> +                        void (*callback)(struct rcu_head *head))
> +{
> +       if (tracepoint_is_syscall(tp))
> +               call_rcu_tasks_trace(head, callback);
> +       else
> +               call_rcu(head, callback);
> +}

I'm leaning towards having the logic to handle sleepable tracepoints
in BPF link implementation (for raw tracepoint and maybe for classic
tracepoints as well) directly, instead of abstracting that behind
tracepoint_call_rcu(). We'll need to know whether tracepoint is
sleepable or not anyways, so no need to hide RCU calls, IMO.

pw-bot: cr

> +
> +static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old)
>  {
>         if (old) {
>                 struct tp_probes *tp_probes = container_of(old,
>                         struct tp_probes, probes[0]);
>
> -               call_rcu(&tp_probes->rcu, rcu_free_old_probes);
> +               tracepoint_call_rcu(tp, &tp_probes->rcu, rcu_free_old_probes);
>         }
>  }
>
> @@ -334,7 +348,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
>                 break;
>         }
>
> -       release_probes(old);
> +       release_probes(tp, old);
>         return 0;
>  }
>
> @@ -406,7 +420,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
>                 WARN_ON_ONCE(1);
>                 break;
>         }
> -       release_probes(old);
> +       release_probes(tp, old);
>         return 0;
>  }
>
> --
> 2.39.5
>