From: Steven Rostedt <rostedt@goodmis.org>
The event trigger data requires a full tracepoint_synchronize_unregister()
call before freeing. That call can take hundreds of milliseconds to
complete. To allow bulk freeing of the trigger data, the code cannot call
tracepoint_synchronize_unregister() for every individual trigger data
being freed.
Create a kthread the first time a trigger data is freed, and have it use
a lockless llist to pull the list of data to free, run
tracepoint_synchronize_unregister(), and then free everything in the
list.
By freeing hundreds of event_trigger_data elements together, only two
runs of the synchronization function are required instead of hundreds.
This speeds up the operation by orders of magnitude (milliseconds
instead of several seconds).
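As a rough illustration (the numbers here are only indicative, not
measurements): removing 100 triggers at ~50 ms per synchronization costs
about 5 seconds on the per-element path, while the bulk path pays for at
most two synchronization periods, on the order of 100 ms.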
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/trace.h | 1 +
kernel/trace/trace_events_trigger.c | 56 +++++++++++++++++++++++++++--
2 files changed, 54 insertions(+), 3 deletions(-)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5863800b1ab3..fd5a6daa6c25 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1808,6 +1808,7 @@ struct event_trigger_data {
char *name;
struct list_head named_list;
struct event_trigger_data *named_data;
+ struct llist_node llist;
};
/* Avoid typos */
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index e5dcfcbb2cd5..16e3449f3cfe 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -6,26 +6,76 @@
*/
#include <linux/security.h>
+#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/rculist.h>
+#include <linux/llist.h>
#include "trace.h"
static LIST_HEAD(trigger_commands);
static DEFINE_MUTEX(trigger_cmd_mutex);
+static struct task_struct *trigger_kthread;
+static struct llist_head trigger_data_free_list;
+static DEFINE_MUTEX(trigger_data_kthread_mutex);
+
+/* Bulk garbage collection of event_trigger_data elements */
+static int trigger_kthread_fn(void *ignore)
+{
+ struct event_trigger_data *data, *tmp;
+ struct llist_node *llnodes;
+
+ /* Once this task starts, it lives forever */
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (llist_empty(&trigger_data_free_list))
+ schedule();
+
+ __set_current_state(TASK_RUNNING);
+
+ llnodes = llist_del_all(&trigger_data_free_list);
+
+ /* make sure current triggers exit before free */
+ tracepoint_synchronize_unregister();
+
+ llist_for_each_entry_safe(data, tmp, llnodes, llist)
+ kfree(data);
+ }
+
+ return 0;
+}
+
void trigger_data_free(struct event_trigger_data *data)
{
if (data->cmd_ops->set_filter)
data->cmd_ops->set_filter(NULL, data, NULL);
- /* make sure current triggers exit before free */
- tracepoint_synchronize_unregister();
+ if (unlikely(!trigger_kthread)) {
+ guard(mutex)(&trigger_data_kthread_mutex);
+ /* Check again after taking mutex */
+ if (!trigger_kthread) {
+ struct task_struct *kthread;
+
+ kthread = kthread_create(trigger_kthread_fn, NULL,
+ "trigger_data_free");
+ if (!IS_ERR(kthread))
+ WRITE_ONCE(trigger_kthread, kthread);
+ }
+ }
+
+ if (!trigger_kthread) {
+ /* Do it the slow way */
+ tracepoint_synchronize_unregister();
+ kfree(data);
+ return;
+ }
- kfree(data);
+ llist_add(&data->llist, &trigger_data_free_list);
+ wake_up_process(trigger_kthread);
}
static inline void data_ops_trigger(struct event_trigger_data *data,
--
2.51.0
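[ Aside: for readers unfamiliar with the llist API the patch builds on,
the lockless producer/consumer pattern looks roughly like this. This is
a minimal sketch, not the patch code; struct item and its payload are
hypothetical, only the llist calls are the real API. ]

    #include <linux/llist.h>
    #include <linux/slab.h>

    struct item {
        struct llist_node llist;
        /* payload ... */
    };

    static LLIST_HEAD(free_list);

    /* Producer side: safe from any context, no lock needed */
    static void queue_item(struct item *it)
    {
        llist_add(&it->llist, &free_list);
    }

    /* Consumer side: atomically detach the whole list, then walk it */
    static void drain_items(void)
    {
        struct llist_node *nodes = llist_del_all(&free_list);
        struct item *it, *tmp;

        llist_for_each_entry_safe(it, tmp, nodes, llist)
            kfree(it);
    }

llist_add() and llist_del_all() are the only synchronization points:
producers push with a cmpxchg loop, and the consumer detaches the entire
list in one atomic exchange, so no mutex or spinlock is involved.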
On Thu, 20 Nov 2025 15:56:02 -0500
Steven Rostedt <rostedt@kernel.org> wrote:
> From: Steven Rostedt <rostedt@goodmis.org>
>
> The event trigger data requires a full tracepoint_synchronize_unregister()
> call before freeing. That call can take hundreds of milliseconds to
> complete. To allow bulk freeing of the trigger data, the code cannot call
> tracepoint_synchronize_unregister() for every individual trigger data
> being freed.
>
> Create a kthread the first time a trigger data is freed, and have it use
> a lockless llist to pull the list of data to free, run
> tracepoint_synchronize_unregister(), and then free everything in the
> list.
>
> By freeing hundreds of event_trigger_data elements together, only two
> runs of the synchronization function are required instead of hundreds.
> This speeds up the operation by orders of magnitude (milliseconds
> instead of several seconds).
>
I have some nitpicks, but it basically looks good to me.
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> kernel/trace/trace.h | 1 +
> kernel/trace/trace_events_trigger.c | 56 +++++++++++++++++++++++++++--
> 2 files changed, 54 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 5863800b1ab3..fd5a6daa6c25 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -1808,6 +1808,7 @@ struct event_trigger_data {
> char *name;
> struct list_head named_list;
> struct event_trigger_data *named_data;
> + struct llist_node llist;
> };
>
> /* Avoid typos */
> diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
> index e5dcfcbb2cd5..16e3449f3cfe 100644
> --- a/kernel/trace/trace_events_trigger.c
> +++ b/kernel/trace/trace_events_trigger.c
> @@ -6,26 +6,76 @@
> */
>
> #include <linux/security.h>
> +#include <linux/kthread.h>
> #include <linux/module.h>
> #include <linux/ctype.h>
> #include <linux/mutex.h>
> #include <linux/slab.h>
> #include <linux/rculist.h>
> +#include <linux/llist.h>
nit: Shouldn't we include this in "trace.h" too, since struct llist_node is used there?
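Something like the following near the top of trace.h (a one-line sketch):

    +#include <linux/llist.h>	/* struct llist_node in event_trigger_data */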
>
> #include "trace.h"
>
> static LIST_HEAD(trigger_commands);
> static DEFINE_MUTEX(trigger_cmd_mutex);
>
> +static struct task_struct *trigger_kthread;
> +static struct llist_head trigger_data_free_list;
> +static DEFINE_MUTEX(trigger_data_kthread_mutex);
> +
> +/* Bulk garbage collection of event_trigger_data elements */
> +static int trigger_kthread_fn(void *ignore)
> +{
> + struct event_trigger_data *data, *tmp;
> + struct llist_node *llnodes;
> +
> + /* Once this task starts, it lives forever */
> + for (;;) {
> + set_current_state(TASK_INTERRUPTIBLE);
> + if (llist_empty(&trigger_data_free_list))
> + schedule();
> +
> + __set_current_state(TASK_RUNNING);
> +
> + llnodes = llist_del_all(&trigger_data_free_list);
> +
> + /* make sure current triggers exit before free */
> + tracepoint_synchronize_unregister();
> +
> + llist_for_each_entry_safe(data, tmp, llnodes, llist)
> + kfree(data);
> + }
> +
> + return 0;
> +}
> +
> void trigger_data_free(struct event_trigger_data *data)
> {
> if (data->cmd_ops->set_filter)
> data->cmd_ops->set_filter(NULL, data, NULL);
>
> - /* make sure current triggers exit before free */
> - tracepoint_synchronize_unregister();
> + if (unlikely(!trigger_kthread)) {
> + guard(mutex)(&trigger_data_kthread_mutex);
> + /* Check again after taking mutex */
> + if (!trigger_kthread) {
> + struct task_struct *kthread;
> +
> + kthread = kthread_create(trigger_kthread_fn, NULL,
> + "trigger_data_free");
> + if (!IS_ERR(kthread))
> + WRITE_ONCE(trigger_kthread, kthread);
> + }
> + }
> +
Hmm, maybe add a comment here:
/* This continues the above error case, but we should do it without the lock. */
?
> + if (!trigger_kthread) {
> + /* Do it the slow way */
> + tracepoint_synchronize_unregister();
> + kfree(data);
> + return;
> + }
>
> - kfree(data);
> + llist_add(&data->llist, &trigger_data_free_list);
> + wake_up_process(trigger_kthread);
> }
>
> static inline void data_ops_trigger(struct event_trigger_data *data,
> --
> 2.51.0
>
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Sat, 22 Nov 2025 00:12:06 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > /* Avoid typos */
> > diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
> > index e5dcfcbb2cd5..16e3449f3cfe 100644
> > --- a/kernel/trace/trace_events_trigger.c
> > +++ b/kernel/trace/trace_events_trigger.c
> > @@ -6,26 +6,76 @@
> > */
> >
> > #include <linux/security.h>
> > +#include <linux/kthread.h>
> > #include <linux/module.h>
> > #include <linux/ctype.h>
> > #include <linux/mutex.h>
> > #include <linux/slab.h>
> > #include <linux/rculist.h>
> > +#include <linux/llist.h>
>
> nit: Shouldn't we include this in "trace.h" too, because llist_node is used?
You mean move it to trace.h instead of including it here?
>
> >
> > #include "trace.h"
> >
> > static LIST_HEAD(trigger_commands);
> > static DEFINE_MUTEX(trigger_cmd_mutex);
> >
> > +static struct task_struct *trigger_kthread;
> > +static struct llist_head trigger_data_free_list;
> > +static DEFINE_MUTEX(trigger_data_kthread_mutex);
> > +
> > +/* Bulk garbage collection of event_trigger_data elements */
> > +static int trigger_kthread_fn(void *ignore)
> > +{
> > + struct event_trigger_data *data, *tmp;
> > + struct llist_node *llnodes;
> > +
> > + /* Once this task starts, it lives forever */
> > + for (;;) {
> > + set_current_state(TASK_INTERRUPTIBLE);
> > + if (llist_empty(&trigger_data_free_list))
> > + schedule();
> > +
> > + __set_current_state(TASK_RUNNING);
> > +
> > + llnodes = llist_del_all(&trigger_data_free_list);
> > +
> > + /* make sure current triggers exit before free */
> > + tracepoint_synchronize_unregister();
> > +
> > + llist_for_each_entry_safe(data, tmp, llnodes, llist)
> > + kfree(data);
> > + }
> > +
> > + return 0;
> > +}
> > +
> > void trigger_data_free(struct event_trigger_data *data)
> > {
> > if (data->cmd_ops->set_filter)
> > data->cmd_ops->set_filter(NULL, data, NULL);
> >
> > - /* make sure current triggers exit before free */
> > - tracepoint_synchronize_unregister();
> > + if (unlikely(!trigger_kthread)) {
> > + guard(mutex)(&trigger_data_kthread_mutex);
> > + /* Check again after taking mutex */
> > + if (!trigger_kthread) {
> > + struct task_struct *kthread;
> > +
> > + kthread = kthread_create(trigger_kthread_fn, NULL,
> > + "trigger_data_free");
> > + if (!IS_ERR(kthread))
> > + WRITE_ONCE(trigger_kthread, kthread);
> > + }
> > + }
> > +
>
> Hmm, maybe add a comment here:
> /* This continues the above error case, but we should do it without the lock. */
> ?
Does this really need a comment? The lock is only needed to make sure we
only create one kthread. If that fails, then we do it the slow way. The
code below is unrelated to the lock. The lock is for kthread creation, not
for protecting the trigger_kthread variable itself.
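In other words, the shape of the logic is (a condensed sketch;
try_create_kthread() is an illustrative stand-in for the
kthread_create()/IS_ERR() dance in the patch):

    if (!trigger_kthread) {                 /* unlocked fast-path check */
        mutex_lock(&trigger_data_kthread_mutex);
        if (!trigger_kthread)               /* re-check under the lock */
            trigger_kthread = try_create_kthread(); /* may fail; stays NULL */
        mutex_unlock(&trigger_data_kthread_mutex);
    }
    if (!trigger_kthread) {                 /* no kthread (now or ever): slow path */
        tracepoint_synchronize_unregister();
        kfree(data);
    } else {                                /* queue for bulk free */
        llist_add(&data->llist, &trigger_data_free_list);
        wake_up_process(trigger_kthread);
    }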
-- Steve
>
> > + if (!trigger_kthread) {
> > + /* Do it the slow way */
> > + tracepoint_synchronize_unregister();
> > + kfree(data);
> > + return;
> > + }
> >
> > - kfree(data);
> > + llist_add(&data->llist, &trigger_data_free_list);
> > + wake_up_process(trigger_kthread);
> > }
> >
> > static inline void data_ops_trigger(struct event_trigger_data *data,
> > --
> > 2.51.0
> >
> >
>
>