Documentation/trace/events.rst | 25 ++++ kernel/trace/trace.c | 11 +- kernel/trace/trace.h | 15 ++- kernel/trace/trace_events.c | 70 +++++----- kernel/trace/trace_events_trigger.c | 199 +++++++++++++++++++++++++++- 5 files changed, 278 insertions(+), 42 deletions(-)
From: Steven Rostedt <rostedt@goodmis.org>
Currently a trigger can only be added to individual events. Some triggers
(like stacktrace) can be useful to add as a bulk trigger for a set of
system events (like interrupt or scheduling).
Add a trigger file to the system directories:
/sys/kernel/tracing/events/*/trigger
And allow stacktrace trigger to be enabled for all those events.
Writing into the system/trigger file acts the same as writing into each of
the system event's trigger files individually.
This also allows to remove a trigger from all events in a subsystem (even
if it's not a subsystem trigger!).
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Note, this is based on top of:
https://patchwork.kernel.org/project/linux-trace-kernel/cover/20251120205600.570673392@kernel.org/
Changes since v1: https://patch.msgid.link/20251120160003.2fa33d80@gandalf.local.home
- Removed unused set variable len (kernel test robot)
- Assign next to strim(buf) to remove beginning spaces
Documentation/trace/events.rst | 25 ++++
kernel/trace/trace.c | 11 +-
kernel/trace/trace.h | 15 ++-
kernel/trace/trace_events.c | 70 +++++-----
kernel/trace/trace_events_trigger.c | 199 +++++++++++++++++++++++++++-
5 files changed, 278 insertions(+), 42 deletions(-)
diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
index 18d112963dec..caa4958af43a 100644
--- a/Documentation/trace/events.rst
+++ b/Documentation/trace/events.rst
@@ -416,6 +416,31 @@ way, so beware about making generalizations between the two.
can also enable triggers that are written into
/sys/kernel/tracing/events/ftrace/print/trigger
+The system directory also has a trigger file that allows some triggers to be
+set for all the system's events. This is limited to only a small subset of the
+triggers and does not allow for the count parameter. But it does allow for
+filters. Writing into this file is the same as writing into each of the
+system's event's trigger files individually. Although only a subset of
+triggers may use this file for enabling, all triggers may use this file for
+disabling::
+
+ cd /sys/kernel/tracing
+ cat events/sched/trigger
+ # Available system triggers:
+ # stacktrace
+
+ echo stacktrace > events/sched/trigger
+ cat events/sched/sched_switch/trigger
+ stacktrace:unlimited
+
+ echo snapshot > events/sched/sched_waking/trigger
+ cat events/sched/sched_waking/trigger
+ snapshot:unlimited
+ echo '!snapshot' > events/sched/trigger
+ cat events/sched/sched_waking/trigger
+ # Available triggers:
+ # traceon traceoff snapshot stacktrace enable_event disable_event enable_hist disable_hist hist
+
6.1 Expression syntax
---------------------
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 032bdedca5d9..feced9f43156 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -592,11 +592,12 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
LIST_HEAD(ftrace_trace_arrays);
-int trace_array_get(struct trace_array *this_tr)
+int __trace_array_get(struct trace_array *this_tr)
{
struct trace_array *tr;
- guard(mutex)(&trace_types_lock);
+ lockdep_assert_held(&trace_types_lock);
+
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr == this_tr) {
tr->ref++;
@@ -607,6 +608,12 @@ int trace_array_get(struct trace_array *this_tr)
return -ENODEV;
}
+int trace_array_get(struct trace_array *tr)
+{
+ guard(mutex)(&trace_types_lock);
+ return __trace_array_get(tr);
+}
+
static void __trace_array_put(struct trace_array *this_tr)
{
WARN_ON(!this_tr->ref);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fd5a6daa6c25..7379763a057d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -469,10 +469,14 @@ extern struct list_head ftrace_trace_arrays;
extern struct mutex trace_types_lock;
extern int trace_array_get(struct trace_array *tr);
+extern int __trace_array_get(struct trace_array *tr);
extern int tracing_check_open_get_tr(struct trace_array *tr);
extern struct trace_array *trace_array_find(const char *instance);
extern struct trace_array *trace_array_find_get(const char *instance);
+extern struct trace_subsystem_dir *trace_get_system_dir(struct inode *inode);
+void trace_put_system_dir(struct trace_subsystem_dir *dir);
+
extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe);
extern int tracing_set_filter_buffering(struct trace_array *tr, bool set);
extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
@@ -1774,6 +1778,7 @@ static inline struct trace_event_file *event_file_file(struct file *filp)
}
extern const struct file_operations event_trigger_fops;
+extern const struct file_operations event_system_trigger_fops;
extern const struct file_operations event_hist_fops;
extern const struct file_operations event_hist_debug_fops;
extern const struct file_operations event_inject_fops;
@@ -2057,10 +2062,16 @@ struct event_command {
* regardless of whether or not it has a filter associated with
* it (filters make a trigger require access to the trace record
* but are not always present).
+ *
+ * @SYSTEM: A flag that says whether or not this command can be used
+ * at the event system level. For example, can it be written into
+ * events/sched/trigger file where it will be enabled for all
+ * sched events?
*/
enum event_command_flags {
- EVENT_CMD_FL_POST_TRIGGER = 1,
- EVENT_CMD_FL_NEEDS_REC = 2,
+ EVENT_CMD_FL_POST_TRIGGER = BIT(1),
+ EVENT_CMD_FL_NEEDS_REC = BIT(2),
+ EVENT_CMD_FL_SYSTEM = BIT(3),
};
static inline bool event_command_post_trigger(struct event_command *cmd_ops)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9b07ad9eb284..f00b41f73fc2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2168,51 +2168,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
static LIST_HEAD(event_subsystems);
-static int subsystem_open(struct inode *inode, struct file *filp)
+struct trace_subsystem_dir *trace_get_system_dir(struct inode *inode)
{
- struct trace_subsystem_dir *dir = NULL, *iter_dir;
- struct trace_array *tr = NULL, *iter_tr;
- struct event_subsystem *system = NULL;
- int ret;
+ struct trace_subsystem_dir *dir;
+ struct trace_array *tr = NULL;
- if (tracing_is_disabled())
- return -ENODEV;
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
/* Make sure the system still exists */
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
- list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
- list_for_each_entry(iter_dir, &iter_tr->systems, list) {
- if (iter_dir == inode->i_private) {
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ list_for_each_entry(dir, &tr->systems, list) {
+ if (dir == inode->i_private) {
/* Don't open systems with no events */
- tr = iter_tr;
- dir = iter_dir;
- if (dir->nr_events) {
- __get_system_dir(dir);
- system = dir->subsystem;
- }
- goto exit_loop;
+ if (!dir->nr_events)
+ return NULL;
+ if (__trace_array_get(tr) < 0)
+ return NULL;
+ __get_system_dir(dir);
+ return dir;
}
}
}
- exit_loop:
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
+ return NULL;
+}
- if (!system)
+void trace_put_system_dir(struct trace_subsystem_dir *dir)
+{
+ trace_array_put(dir->tr);
+ put_system(dir);
+}
+
+static int subsystem_open(struct inode *inode, struct file *filp)
+{
+ struct trace_subsystem_dir *dir;
+ int ret;
+
+ if (tracing_is_disabled())
return -ENODEV;
- /* Still need to increment the ref count of the system */
- if (trace_array_get(tr) < 0) {
- put_system(dir);
+ dir = trace_get_system_dir(inode);
+ if (!dir)
return -ENODEV;
- }
ret = tracing_open_generic(inode, filp);
- if (ret < 0) {
- trace_array_put(tr);
- put_system(dir);
- }
+ if (ret < 0)
+ trace_put_system_dir(dir);
return ret;
}
@@ -2761,6 +2762,9 @@ static int system_callback(const char *name, umode_t *mode, void **data,
else if (strcmp(name, "enable") == 0)
*fops = &ftrace_system_enable_fops;
+ else if (strcmp(name, "trigger") == 0)
+ *fops = &event_system_trigger_fops;
+
else
return 0;
@@ -2784,6 +2788,10 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
{
.name = "enable",
.callback = system_callback,
+ },
+ {
+ .name = "trigger",
+ .callback = system_callback,
}
};
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 1dfe69146a81..9249770679f3 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -329,21 +329,28 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
return -EINVAL;
}
+static char *get_user_buf(const char __user *ubuf, size_t cnt)
+{
+ if (!cnt)
+ return NULL;
+
+ if (cnt >= PAGE_SIZE)
+ return ERR_PTR(-EINVAL);
+
+ return memdup_user_nul(ubuf, cnt);
+}
+
static ssize_t event_trigger_regex_write(struct file *file,
const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_event_file *event_file;
ssize_t ret;
- char *buf __free(kfree) = NULL;
+ char *buf __free(kfree) = get_user_buf(ubuf, cnt);
- if (!cnt)
+ if (!buf)
return 0;
- if (cnt >= PAGE_SIZE)
- return -EINVAL;
-
- buf = memdup_user_nul(ubuf, cnt);
if (IS_ERR(buf))
return PTR_ERR(buf);
@@ -397,6 +404,184 @@ const struct file_operations event_trigger_fops = {
.release = event_trigger_release,
};
+static ssize_t
+event_system_trigger_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ char *buf __free(kfree) = kmalloc(SZ_4K, GFP_KERNEL);
+ struct event_command *p;
+ struct seq_buf s;
+ int len;
+
+ if (!buf)
+ return -ENOMEM;
+
+ seq_buf_init(&s, buf, SZ_4K);
+
+ seq_buf_puts(&s, "# Available system triggers:\n");
+ seq_buf_putc(&s, '#');
+
+ guard(mutex)(&trigger_cmd_mutex);
+ list_for_each_entry_reverse(p, &trigger_commands, list) {
+ if (p->flags & EVENT_CMD_FL_SYSTEM)
+ seq_buf_printf(&s, " %s", p->name);
+ }
+ seq_buf_putc(&s, '\n');
+
+ len = seq_buf_used(&s);
+
+ if (*ppos >= len)
+ return 0;
+
+ len -= *ppos;
+
+ if (count > len)
+ count = len;
+
+ if (copy_to_user(ubuf, buf + *ppos, count))
+ return -EFAULT;
+
+ *ppos += count;
+
+ return count;
+}
+
+static int process_system_events(struct trace_subsystem_dir *dir,
+ struct event_command *p, char *buff,
+ char *command, char *next)
+{
+ struct event_subsystem *system = dir->subsystem;
+ struct trace_event_file *file;
+ struct trace_array *tr = dir->tr;
+ bool remove = false;
+ int ret = 0;
+
+ if (buff[0] == '!')
+ remove = true;
+
+ lockdep_assert_held(&event_mutex);
+
+ list_for_each_entry(file, &tr->events, list) {
+
+ if (strcmp(system->name, file->event_call->class->system) != 0)
+ continue;
+
+ ret = p->parse(p, file, buff, command, next);
+
+ /* Removals and existing events do not error */
+ if (ret < 0 && ret != -EEXIST && !remove) {
+ pr_warn("Failed adding trigger %s on %s\n",
+ command, trace_event_name(file->event_call));
+ }
+ }
+ return 0;
+}
+
+static ssize_t
+event_system_trigger_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_subsystem_dir *dir = filp->private_data;
+ struct event_command *p;
+ char *command, *next;
+ char *buf __free(kfree) = get_user_buf(ubuf, cnt);
+ bool remove = false;
+ bool found = false;
+ ssize_t ret;
+
+ if (!buf)
+ return 0;
+
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ /* system triggers are not allowed to have counters */
+ if (strchr(buf, ':'))
+ return -EINVAL;
+
+ /* If opened for read too, dir is in the seq_file descriptor */
+ if (filp->f_mode & FMODE_READ) {
+ struct seq_file *m = filp->private_data;
+ dir = m->private;
+ }
+
+ /* Skip added space at beginning of buf */
+ next = strim(buf);
+
+ command = strsep(&next, " \t");
+ if (next) {
+ next = skip_spaces(next);
+ if (!*next)
+ next = NULL;
+ }
+ if (command[0] == '!') {
+ remove = true;
+ command++;
+ }
+
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
+ list_for_each_entry(p, &trigger_commands, list) {
+ /* Allow to remove any trigger */
+ if (!remove && !(p->flags & EVENT_CMD_FL_SYSTEM))
+ continue;
+ if (strcmp(p->name, command) == 0) {
+ found = true;
+ ret = process_system_events(dir, p, buf, command, next);
+ break;
+ }
+ }
+
+ if (!found)
+ ret = -ENODEV;
+
+ if (!ret)
+ *ppos += cnt;
+
+ if (remove || ret < 0)
+ return ret ? : cnt;
+
+ return cnt;
+}
+
+static int
+event_system_trigger_open(struct inode *inode, struct file *file)
+{
+ struct trace_subsystem_dir *dir;
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
+ dir = trace_get_system_dir(inode);
+ if (!dir)
+ return -ENODEV;
+
+ file->private_data = dir;
+
+ return ret;
+}
+
+static int
+event_system_trigger_release(struct inode *inode, struct file *file)
+{
+ struct trace_subsystem_dir *dir = inode->i_private;
+
+ trace_put_system_dir(dir);
+
+ return 0;
+}
+
+const struct file_operations event_system_trigger_fops = {
+ .open = event_system_trigger_open,
+ .read = event_system_trigger_read,
+ .write = event_system_trigger_write,
+ .llseek = tracing_lseek,
+ .release = event_system_trigger_release,
+};
+
/*
* Currently we only register event commands from __init, so mark this
* __init too.
@@ -1587,7 +1772,7 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
static struct event_command trigger_stacktrace_cmd = {
.name = "stacktrace",
.trigger_type = ETT_STACKTRACE,
- .flags = EVENT_CMD_FL_POST_TRIGGER,
+ .flags = EVENT_CMD_FL_POST_TRIGGER | EVENT_CMD_FL_SYSTEM,
.parse = event_trigger_parse,
.reg = register_trigger,
.unreg = unregister_trigger,
--
2.51.0
On Tue, 25 Nov 2025 20:08:37 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> From: Steven Rostedt <rostedt@goodmis.org>
>
> Currently a trigger can only be added to individual events. Some triggers
> (like stacktrace) can be useful to add as a bulk trigger for a set of
> system events (like interrupt or scheduling).
>
> Add a trigger file to the system directories:
>
> /sys/kernel/tracing/events/*/trigger
>
> And allow stacktrace trigger to be enabled for all those events.
>
> Writing into the system/trigger file acts the same as writing into each of
> the system event's trigger files individually.
>
> This also allows to remove a trigger from all events in a subsystem (even
> if it's not a subsystem trigger!).
>
I have some comments below.
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> Note, this is based on top of:
>
> https://patchwork.kernel.org/project/linux-trace-kernel/cover/20251120205600.570673392@kernel.org/
>
> Changes since v1: https://patch.msgid.link/20251120160003.2fa33d80@gandalf.local.home
>
> - Removed unused set variable len (kernel test robot)
>
> - Assign next to strim(buf) to remove beginning spaces
>
> Documentation/trace/events.rst | 25 ++++
> kernel/trace/trace.c | 11 +-
> kernel/trace/trace.h | 15 ++-
> kernel/trace/trace_events.c | 70 +++++-----
> kernel/trace/trace_events_trigger.c | 199 +++++++++++++++++++++++++++-
> 5 files changed, 278 insertions(+), 42 deletions(-)
>
> diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
> index 18d112963dec..caa4958af43a 100644
> --- a/Documentation/trace/events.rst
> +++ b/Documentation/trace/events.rst
> @@ -416,6 +416,31 @@ way, so beware about making generalizations between the two.
> can also enable triggers that are written into
> /sys/kernel/tracing/events/ftrace/print/trigger
>
> +The system directory also has a trigger file that allows some triggers to be
> +set for all the system's events. This is limited to only a small subset of the
> +triggers and does not allow for the count parameter. But it does allow for
> +filters. Writing into this file is the same as writing into each of the
> +system's event's trigger files individually. Although only a subset of
> +triggers may use this file for enabling, all triggers may use this file for
> +disabling::
> +
> + cd /sys/kernel/tracing
> + cat events/sched/trigger
> + # Available system triggers:
> + # stacktrace
> +
> + echo stacktrace > events/sched/trigger
> + cat events/sched/sched_switch/trigger
> + stacktrace:unlimited
> +
> + echo snapshot > events/sched/sched_waking/trigger
> + cat events/sched/sched_waking/trigger
> + snapshot:unlimited
> + echo '!snapshot' > events/sched/trigger
> + cat events/sched/sched_waking/trigger
> + # Available triggers:
> + # traceon traceoff snapshot stacktrace enable_event disable_event enable_hist disable_hist hist
> +
> 6.1 Expression syntax
> ---------------------
>
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 032bdedca5d9..feced9f43156 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -592,11 +592,12 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
>
> LIST_HEAD(ftrace_trace_arrays);
>
> -int trace_array_get(struct trace_array *this_tr)
> +int __trace_array_get(struct trace_array *this_tr)
> {
> struct trace_array *tr;
>
> - guard(mutex)(&trace_types_lock);
> + lockdep_assert_held(&trace_types_lock);
> +
> list_for_each_entry(tr, &ftrace_trace_arrays, list) {
> if (tr == this_tr) {
> tr->ref++;
> @@ -607,6 +608,12 @@ int trace_array_get(struct trace_array *this_tr)
> return -ENODEV;
> }
>
> +int trace_array_get(struct trace_array *tr)
> +{
> + guard(mutex)(&trace_types_lock);
> + return __trace_array_get(tr);
> +}
> +
> static void __trace_array_put(struct trace_array *this_tr)
> {
> WARN_ON(!this_tr->ref);
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index fd5a6daa6c25..7379763a057d 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -469,10 +469,14 @@ extern struct list_head ftrace_trace_arrays;
> extern struct mutex trace_types_lock;
>
> extern int trace_array_get(struct trace_array *tr);
> +extern int __trace_array_get(struct trace_array *tr);
> extern int tracing_check_open_get_tr(struct trace_array *tr);
> extern struct trace_array *trace_array_find(const char *instance);
> extern struct trace_array *trace_array_find_get(const char *instance);
>
> +extern struct trace_subsystem_dir *trace_get_system_dir(struct inode *inode);
> +void trace_put_system_dir(struct trace_subsystem_dir *dir);
> +
> extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe);
> extern int tracing_set_filter_buffering(struct trace_array *tr, bool set);
> extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
> @@ -1774,6 +1778,7 @@ static inline struct trace_event_file *event_file_file(struct file *filp)
> }
>
> extern const struct file_operations event_trigger_fops;
> +extern const struct file_operations event_system_trigger_fops;
> extern const struct file_operations event_hist_fops;
> extern const struct file_operations event_hist_debug_fops;
> extern const struct file_operations event_inject_fops;
> @@ -2057,10 +2062,16 @@ struct event_command {
> * regardless of whether or not it has a filter associated with
> * it (filters make a trigger require access to the trace record
> * but are not always present).
> + *
> + * @SYSTEM: A flag that says whether or not this command can be used
> + * at the event system level. For example, can it be written into
> + * events/sched/trigger file where it will be enabled for all
> + * sched events?
> */
> enum event_command_flags {
> - EVENT_CMD_FL_POST_TRIGGER = 1,
> - EVENT_CMD_FL_NEEDS_REC = 2,
> + EVENT_CMD_FL_POST_TRIGGER = BIT(1),
> + EVENT_CMD_FL_NEEDS_REC = BIT(2),
> + EVENT_CMD_FL_SYSTEM = BIT(3),
> };
>
> static inline bool event_command_post_trigger(struct event_command *cmd_ops)
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 9b07ad9eb284..f00b41f73fc2 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -2168,51 +2168,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
>
> static LIST_HEAD(event_subsystems);
>
> -static int subsystem_open(struct inode *inode, struct file *filp)
> +struct trace_subsystem_dir *trace_get_system_dir(struct inode *inode)
> {
> - struct trace_subsystem_dir *dir = NULL, *iter_dir;
> - struct trace_array *tr = NULL, *iter_tr;
> - struct event_subsystem *system = NULL;
> - int ret;
> + struct trace_subsystem_dir *dir;
> + struct trace_array *tr = NULL;
nit: This also no need to be initialized.
>
> - if (tracing_is_disabled())
> - return -ENODEV;
> + guard(mutex)(&event_mutex);
> + guard(mutex)(&trace_types_lock);
>
> /* Make sure the system still exists */
> - mutex_lock(&event_mutex);
> - mutex_lock(&trace_types_lock);
> - list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
> - list_for_each_entry(iter_dir, &iter_tr->systems, list) {
> - if (iter_dir == inode->i_private) {
> + list_for_each_entry(tr, &ftrace_trace_arrays, list) {
> + list_for_each_entry(dir, &tr->systems, list) {
> + if (dir == inode->i_private) {
> /* Don't open systems with no events */
> - tr = iter_tr;
> - dir = iter_dir;
> - if (dir->nr_events) {
> - __get_system_dir(dir);
> - system = dir->subsystem;
> - }
> - goto exit_loop;
> + if (!dir->nr_events)
> + return NULL;
> + if (__trace_array_get(tr) < 0)
> + return NULL;
> + __get_system_dir(dir);
> + return dir;
> }
> }
> }
> - exit_loop:
> - mutex_unlock(&trace_types_lock);
> - mutex_unlock(&event_mutex);
> + return NULL;
> +}
>
> - if (!system)
> +void trace_put_system_dir(struct trace_subsystem_dir *dir)
> +{
> + trace_array_put(dir->tr);
> + put_system(dir);
> +}
> +
> +static int subsystem_open(struct inode *inode, struct file *filp)
> +{
> + struct trace_subsystem_dir *dir;
> + int ret;
> +
> + if (tracing_is_disabled())
> return -ENODEV;
>
> - /* Still need to increment the ref count of the system */
> - if (trace_array_get(tr) < 0) {
> - put_system(dir);
> + dir = trace_get_system_dir(inode);
> + if (!dir)
> return -ENODEV;
> - }
>
> ret = tracing_open_generic(inode, filp);
> - if (ret < 0) {
> - trace_array_put(tr);
> - put_system(dir);
> - }
> + if (ret < 0)
> + trace_put_system_dir(dir);
>
> return ret;
> }
> @@ -2761,6 +2762,9 @@ static int system_callback(const char *name, umode_t *mode, void **data,
> else if (strcmp(name, "enable") == 0)
> *fops = &ftrace_system_enable_fops;
>
> + else if (strcmp(name, "trigger") == 0)
> + *fops = &event_system_trigger_fops;
> +
> else
> return 0;
>
> @@ -2784,6 +2788,10 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
> {
> .name = "enable",
> .callback = system_callback,
> + },
> + {
> + .name = "trigger",
> + .callback = system_callback,
> }
> };
>
> diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
> index 1dfe69146a81..9249770679f3 100644
> --- a/kernel/trace/trace_events_trigger.c
> +++ b/kernel/trace/trace_events_trigger.c
> @@ -329,21 +329,28 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
> return -EINVAL;
> }
>
> +static char *get_user_buf(const char __user *ubuf, size_t cnt)
> +{
> + if (!cnt)
> + return NULL;
> +
> + if (cnt >= PAGE_SIZE)
> + return ERR_PTR(-EINVAL);
> +
> + return memdup_user_nul(ubuf, cnt);
> +}
> +
> static ssize_t event_trigger_regex_write(struct file *file,
> const char __user *ubuf,
> size_t cnt, loff_t *ppos)
> {
> struct trace_event_file *event_file;
> ssize_t ret;
> - char *buf __free(kfree) = NULL;
> + char *buf __free(kfree) = get_user_buf(ubuf, cnt);
>
> - if (!cnt)
> + if (!buf)
> return 0;
>
> - if (cnt >= PAGE_SIZE)
> - return -EINVAL;
> -
> - buf = memdup_user_nul(ubuf, cnt);
> if (IS_ERR(buf))
> return PTR_ERR(buf);
You can simply write:
if (IS_ERR_OR_NULL(buf))
return PTR_ERR_OR_ZERO(buf);
>
> @@ -397,6 +404,184 @@ const struct file_operations event_trigger_fops = {
> .release = event_trigger_release,
> };
>
> +static ssize_t
> +event_system_trigger_read(struct file *filp, char __user *ubuf,
> + size_t count, loff_t *ppos)
> +{
> + char *buf __free(kfree) = kmalloc(SZ_4K, GFP_KERNEL);
> + struct event_command *p;
> + struct seq_buf s;
> + int len;
> +
> + if (!buf)
> + return -ENOMEM;
> +
> + seq_buf_init(&s, buf, SZ_4K);
> +
> + seq_buf_puts(&s, "# Available system triggers:\n");
> + seq_buf_putc(&s, '#');
> +
> + guard(mutex)(&trigger_cmd_mutex);
> + list_for_each_entry_reverse(p, &trigger_commands, list) {
> + if (p->flags & EVENT_CMD_FL_SYSTEM)
> + seq_buf_printf(&s, " %s", p->name);
> + }
> + seq_buf_putc(&s, '\n');
> +
> + len = seq_buf_used(&s);
> +
> + if (*ppos >= len)
> + return 0;
> +
> + len -= *ppos;
> +
> + if (count > len)
> + count = len;
> +
> + if (copy_to_user(ubuf, buf + *ppos, count))
> + return -EFAULT;
> +
> + *ppos += count;
> +
> + return count;
> +}
> +
> +static int process_system_events(struct trace_subsystem_dir *dir,
> + struct event_command *p, char *buff,
> + char *command, char *next)
> +{
> + struct event_subsystem *system = dir->subsystem;
> + struct trace_event_file *file;
> + struct trace_array *tr = dir->tr;
> + bool remove = false;
> + int ret = 0;
> +
> + if (buff[0] == '!')
> + remove = true;
> +
> + lockdep_assert_held(&event_mutex);
> +
> + list_for_each_entry(file, &tr->events, list) {
> +
> + if (strcmp(system->name, file->event_call->class->system) != 0)
> + continue;
> +
> + ret = p->parse(p, file, buff, command, next);
> +
> + /* Removals and existing events do not error */
> + if (ret < 0 && ret != -EEXIST && !remove) {
> + pr_warn("Failed adding trigger %s on %s\n",
> + command, trace_event_name(file->event_call));
> + }
Can I expect that this can recover the previous settings
via event trigger?
e.g.
# echo "stacktrace" > events/sched/sched_wakeup/trigger
# echo "stacktrace" > events/sched/trigger
# echo "!stacktrace" > events/sched/trigger
# cat events/sched/sched_wakeup/trigger
stacktrace:unlimited
?
> + }
> + return 0;
> +}
> +
> +static ssize_t
> +event_system_trigger_write(struct file *filp, const char __user *ubuf,
> + size_t cnt, loff_t *ppos)
> +{
> + struct trace_subsystem_dir *dir = filp->private_data;
> + struct event_command *p;
> + char *command, *next;
> + char *buf __free(kfree) = get_user_buf(ubuf, cnt);
> + bool remove = false;
> + bool found = false;
> + ssize_t ret;
> +
> + if (!buf)
> + return 0;
> +
> + if (IS_ERR(buf))
> + return PTR_ERR(buf);
Ditto.
> +
> + /* system triggers are not allowed to have counters */
> + if (strchr(buf, ':'))
> + return -EINVAL;
':' is not always used for counters (e.g. hist) and it seems odd
to check anything about parse here. Can we do this counter check
after parse a command?
> +
> + /* If opened for read too, dir is in the seq_file descriptor */
> + if (filp->f_mode & FMODE_READ) {
> + struct seq_file *m = filp->private_data;
> + dir = m->private;
> + }
> +
> + /* Skip added space at beginning of buf */
> + next = strim(buf);
> +
> + command = strsep(&next, " \t");
> + if (next) {
> + next = skip_spaces(next);
> + if (!*next)
> + next = NULL;
> + }
strim() removes both leading and trailing whitespace. So this check is
not required.
Thanks,
> + if (command[0] == '!') {
> + remove = true;
> + command++;
> + }
> +
> + guard(mutex)(&event_mutex);
> + guard(mutex)(&trigger_cmd_mutex);
> +
> + list_for_each_entry(p, &trigger_commands, list) {
> + /* Allow to remove any trigger */
> + if (!remove && !(p->flags & EVENT_CMD_FL_SYSTEM))
> + continue;
> + if (strcmp(p->name, command) == 0) {
> + found = true;
> + ret = process_system_events(dir, p, buf, command, next);
> + break;
> + }
> + }
> +
> + if (!found)
> + ret = -ENODEV;
> +
> + if (!ret)
> + *ppos += cnt;
> +
> + if (remove || ret < 0)
> + return ret ? : cnt;
> +
> + return cnt;
> +}
> +
> +static int
> +event_system_trigger_open(struct inode *inode, struct file *file)
> +{
> + struct trace_subsystem_dir *dir;
> + int ret;
> +
> + ret = security_locked_down(LOCKDOWN_TRACEFS);
> + if (ret)
> + return ret;
> +
> + dir = trace_get_system_dir(inode);
> + if (!dir)
> + return -ENODEV;
> +
> + file->private_data = dir;
> +
> + return ret;
> +}
> +
> +static int
> +event_system_trigger_release(struct inode *inode, struct file *file)
> +{
> + struct trace_subsystem_dir *dir = inode->i_private;
> +
> + trace_put_system_dir(dir);
> +
> + return 0;
> +}
> +
> +const struct file_operations event_system_trigger_fops = {
> + .open = event_system_trigger_open,
> + .read = event_system_trigger_read,
> + .write = event_system_trigger_write,
> + .llseek = tracing_lseek,
> + .release = event_system_trigger_release,
> +};
> +
> /*
> * Currently we only register event commands from __init, so mark this
> * __init too.
> @@ -1587,7 +1772,7 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
> static struct event_command trigger_stacktrace_cmd = {
> .name = "stacktrace",
> .trigger_type = ETT_STACKTRACE,
> - .flags = EVENT_CMD_FL_POST_TRIGGER,
> + .flags = EVENT_CMD_FL_POST_TRIGGER | EVENT_CMD_FL_SYSTEM,
> .parse = event_trigger_parse,
> .reg = register_trigger,
> .unreg = unregister_trigger,
> --
> 2.51.0
>
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Wed, 26 Nov 2025 15:02:51 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > This also allows to remove a trigger from all events in a subsystem (even
> > if it's not a subsystem trigger!).
> >
>
> I have some comments below.
BTW, it's more appropriate to simply trim the email ;-)
> > --- a/kernel/trace/trace_events.c
> > +++ b/kernel/trace/trace_events.c
> > @@ -2168,51 +2168,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
> >
> > static LIST_HEAD(event_subsystems);
> >
> > -static int subsystem_open(struct inode *inode, struct file *filp)
> > +struct trace_subsystem_dir *trace_get_system_dir(struct inode *inode)
> > {
> > - struct trace_subsystem_dir *dir = NULL, *iter_dir;
> > - struct trace_array *tr = NULL, *iter_tr;
> > - struct event_subsystem *system = NULL;
> > - int ret;
> > + struct trace_subsystem_dir *dir;
> > + struct trace_array *tr = NULL;
>
> nit: This also no need to be initialized.
Hmm, I guess this was needed in one of the versions I had before posting.
I'll fix in v3.
>
> >
> > - if (tracing_is_disabled())
> > - return -ENODEV;
> > + guard(mutex)(&event_mutex);
> > + guard(mutex)(&trace_types_lock);
> >
> > /* Make sure the system still exists */
> > - mutex_lock(&event_mutex);
> > - mutex_lock(&trace_types_lock);
> > - list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
> > - list_for_each_entry(iter_dir, &iter_tr->systems, list) {
> > - if (iter_dir == inode->i_private) {
> > + list_for_each_entry(tr, &ftrace_trace_arrays, list) {
> > + list_for_each_entry(dir, &tr->systems, list) {
> > + if (dir == inode->i_private) {
> > /* Don't open systems with no events */
> > - tr = iter_tr;
> > - dir = iter_dir;
> > - if (dir->nr_events) {
> > - __get_system_dir(dir);
> > - system = dir->subsystem;
> > - }
> > - goto exit_loop;
> > + if (!dir->nr_events)
> > + return NULL;
> > + if (__trace_array_get(tr) < 0)
> > + return NULL;
> > + __get_system_dir(dir);
> > + return dir;
> > }
> > }
> > }
> > static ssize_t event_trigger_regex_write(struct file *file,
> > const char __user *ubuf,
> > size_t cnt, loff_t *ppos)
> > {
> > struct trace_event_file *event_file;
> > ssize_t ret;
> > - char *buf __free(kfree) = NULL;
> > + char *buf __free(kfree) = get_user_buf(ubuf, cnt);
> >
> > - if (!cnt)
> > + if (!buf)
> > return 0;
> >
> > - if (cnt >= PAGE_SIZE)
> > - return -EINVAL;
> > -
> > - buf = memdup_user_nul(ubuf, cnt);
> > if (IS_ERR(buf))
> > return PTR_ERR(buf);
>
> You can simply write:
>
> if (IS_ERR_OR_NULL(buf))
> return PTR_ERR_OR_ZERO(buf);
Yes I can. But honestly, the above is much harder to understand what is
happening than the code I had written.
I mean:
if (!buf)
return 0;
if (IS_ERR(buf))
return PTR_ERR(buf);
is pretty obvious of what is happening.
if (IS_ERR_OR_NULL(buf))
return PTR_ERR_OR_ZERO(buf);
Is quite a bit more obfuscated. I mean, I needed to look up what
PTR_ERR_OR_ZERO() did to be sure I knew what was returned.
> > + list_for_each_entry(file, &tr->events, list) {
> > +
> > + if (strcmp(system->name, file->event_call->class->system) != 0)
> > + continue;
> > +
> > + ret = p->parse(p, file, buff, command, next);
> > +
> > + /* Removals and existing events do not error */
> > + if (ret < 0 && ret != -EEXIST && !remove) {
> > + pr_warn("Failed adding trigger %s on %s\n",
> > + command, trace_event_name(file->event_call));
> > + }
>
>
> Can I expect that this can recover the previous settings
> via event trigger?
> e.g.
>
> # echo "stacktrace" > events/sched/sched_wakeup/trigger
> # echo "stacktrace" > events/sched/trigger
> # echo "!stacktrace" > events/sched/trigger
> # cat events/sched/sched_wakeup/trigger
> stacktrace:unlimited
>
> ?
No. In fact, this is one of the features of the system trigger. Writing
into the system/trigger file is the same as writing into each of the
system's event's trigger files one at a time. In fact, I updated the
documentation in this patch to show that this file can be used to clear
tiggers too!
+ echo snapshot > events/sched/sched_waking/trigger
+ cat events/sched/sched_waking/trigger
+ snapshot:unlimited
+ echo '!snapshot' > events/sched/trigger
+ cat events/sched/sched_waking/trigger
+ # Available triggers:
+ # traceon traceoff snapshot stacktrace enable_event disable_event enable_hist disable_hist hist
>
>
> > + }
> > + return 0;
> > +}
> > +
> > +static ssize_t
> > +event_system_trigger_write(struct file *filp, const char __user *ubuf,
> > + size_t cnt, loff_t *ppos)
> > +{
> > + struct trace_subsystem_dir *dir = filp->private_data;
> > + struct event_command *p;
> > + char *command, *next;
> > + char *buf __free(kfree) = get_user_buf(ubuf, cnt);
> > + bool remove = false;
> > + bool found = false;
> > + ssize_t ret;
> > +
> > + if (!buf)
> > + return 0;
> > +
> > + if (IS_ERR(buf))
> > + return PTR_ERR(buf);
>
> Ditto.
And ditto again with my reply ;-)
>
> > +
> > + /* system triggers are not allowed to have counters */
> > + if (strchr(buf, ':'))
> > + return -EINVAL;
>
> ':' is not always used for counters (e.g. hist) and it seems odd
> to check anything about parse here. Can we do this counter check
> after parse a command?
>
> > +
> > + /* If opened for read too, dir is in the seq_file descriptor */
> > + if (filp->f_mode & FMODE_READ) {
> > + struct seq_file *m = filp->private_data;
> > + dir = m->private;
> > + }
> > +
> > + /* Skip added space at beginning of buf */
> > + next = strim(buf);
> > +
> > + command = strsep(&next, " \t");
> > + if (next) {
> > + next = skip_spaces(next);
> > + if (!*next)
> > + next = NULL;
> > + }
>
> strim() removes both leading and trailing whitespace. So this check is
> not required.
But next here is not the one that had strim() attached to it.
command = strsep(&next, " \t");
Updates the content of next.
Thanks for the review,
-- Steve
© 2016 - 2025 Red Hat, Inc.