[for-linus][PATCH] tracing: Drain deferred trigger frees if kthread creation fails

Steven Rostedt posted 1 patch 5 days ago
kernel/trace/trace_events_trigger.c | 79 +++++++++++++++++++++++++++++++------
1 file changed, 66 insertions(+), 13 deletions(-)
[for-linus][PATCH] tracing: Drain deferred trigger frees if kthread creation fails
Posted by Steven Rostedt 5 days ago

tracing fix for 7.0:

- Fix freeing of event triggers in early boot up

  If the same trigger is added twice on the kernel command line, the
  second one will fail to be applied, and the trigger data that was
  created for it will be freed. That free path goes through the deferred
  logic and creates a kernel thread to do the freeing. But the command
  line logic runs before kernel threads can be created, which leads to a
  NULL pointer dereference.

  Delay freeing event triggers until late init.

  git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
trace/fixes

Head SHA1: 250ab25391edeeab8462b68be42e4904506c409c


Wesley Atwell (1):
      tracing: Drain deferred trigger frees if kthread creation fails

----
 kernel/trace/trace_events_trigger.c | 79 +++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 13 deletions(-)
---------------------------
commit 250ab25391edeeab8462b68be42e4904506c409c
Author: Wesley Atwell <atwellwea@gmail.com>
Date:   Tue Mar 24 16:13:26 2026 -0600

    tracing: Drain deferred trigger frees if kthread creation fails
    
    Boot-time trigger registration can fail before the trigger-data cleanup
    kthread exists. Deferring those frees until late init is fine, but the
    post-boot fallback must still drain the deferred list if kthread
    creation never succeeds.
    
    Otherwise, boot-deferred nodes can accumulate on
    trigger_data_free_list; later frees then fall back to synchronously
    freeing only the current object, and the older queued entries are
    leaked forever.
    
    To trigger this, add the following to the kernel command line:
    
      trace_event=sched_switch trace_trigger=sched_switch.traceon,sched_switch.traceon
    
    The second traceon trigger will fail and be freed. Without this fix,
    that free path dereferences a NULL pointer and crashes the kernel.
    
    Keep the deferred boot-time behavior, but when kthread creation fails,
    drain the whole queued list synchronously. Do the same in the late-init
    drain path so queued entries are not stranded there either.
    
    Cc: stable@vger.kernel.org
    Link: https://patch.msgid.link/20260324221326.1395799-3-atwellwea@gmail.com
    Fixes: 61d445af0a7c ("tracing: Add bulk garbage collection of freeing event_trigger_data")
    Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
    Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d5230b759a2d..655db2e82513 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -22,6 +22,39 @@ static struct task_struct *trigger_kthread;
 static struct llist_head trigger_data_free_list;
 static DEFINE_MUTEX(trigger_data_kthread_mutex);
 
+static int trigger_kthread_fn(void *ignore);
+
+static void trigger_create_kthread_locked(void)
+{
+	lockdep_assert_held(&trigger_data_kthread_mutex);
+
+	if (!trigger_kthread) {
+		struct task_struct *kthread;
+
+		kthread = kthread_create(trigger_kthread_fn, NULL,
+					 "trigger_data_free");
+		if (!IS_ERR(kthread))
+			WRITE_ONCE(trigger_kthread, kthread);
+	}
+}
+
+static void trigger_data_free_queued_locked(void)
+{
+	struct event_trigger_data *data, *tmp;
+	struct llist_node *llnodes;
+
+	lockdep_assert_held(&trigger_data_kthread_mutex);
+
+	llnodes = llist_del_all(&trigger_data_free_list);
+	if (!llnodes)
+		return;
+
+	tracepoint_synchronize_unregister();
+
+	llist_for_each_entry_safe(data, tmp, llnodes, llist)
+		kfree(data);
+}
+
 /* Bulk garbage collection of event_trigger_data elements */
 static int trigger_kthread_fn(void *ignore)
 {
@@ -56,30 +89,50 @@ void trigger_data_free(struct event_trigger_data *data)
 	if (data->cmd_ops->set_filter)
 		data->cmd_ops->set_filter(NULL, data, NULL);
 
+	/*
+	 * Boot-time trigger registration can fail before kthread creation
+	 * works. Keep the deferred-free semantics during boot and let late
+	 * init start the kthread to drain the list.
+	 */
+	if (system_state == SYSTEM_BOOTING && !trigger_kthread) {
+		llist_add(&data->llist, &trigger_data_free_list);
+		return;
+	}
+
 	if (unlikely(!trigger_kthread)) {
 		guard(mutex)(&trigger_data_kthread_mutex);
+
+		trigger_create_kthread_locked();
 		/* Check again after taking mutex */
 		if (!trigger_kthread) {
-			struct task_struct *kthread;
-
-			kthread = kthread_create(trigger_kthread_fn, NULL,
-						 "trigger_data_free");
-			if (!IS_ERR(kthread))
-				WRITE_ONCE(trigger_kthread, kthread);
+			llist_add(&data->llist, &trigger_data_free_list);
+			/* Drain the queued frees synchronously if creation failed. */
+			trigger_data_free_queued_locked();
+			return;
 		}
 	}
 
-	if (!trigger_kthread) {
-		/* Do it the slow way */
-		tracepoint_synchronize_unregister();
-		kfree(data);
-		return;
-	}
-
 	llist_add(&data->llist, &trigger_data_free_list);
 	wake_up_process(trigger_kthread);
 }
 
+static int __init trigger_data_free_init(void)
+{
+	guard(mutex)(&trigger_data_kthread_mutex);
+
+	if (llist_empty(&trigger_data_free_list))
+		return 0;
+
+	trigger_create_kthread_locked();
+	if (trigger_kthread)
+		wake_up_process(trigger_kthread);
+	else
+		trigger_data_free_queued_locked();
+
+	return 0;
+}
+late_initcall(trigger_data_free_init);
+
 static inline void data_ops_trigger(struct event_trigger_data *data,
 				    struct trace_buffer *buffer,  void *rec,
 				    struct ring_buffer_event *event)