[v2] tracing: Add system trigger file to enable triggers for all the system's events

[PATCH v2] tracing: Add system trigger file to enable triggers for all the system's events

Posted by Steven Rostedt 2 months, 2 weeks ago

From: Steven Rostedt <rostedt@goodmis.org>

Currently a trigger can only be added to individual events. Some triggers
(like stacktrace) can be useful to add as a bulk trigger for a set of
system events (like interrupt or scheduling).

Add a trigger file to the system directories:

   /sys/kernel/tracing/events/*/trigger

And allow stacktrace trigger to be enabled for all those events.

Writing into the system/trigger file acts the same as writing into each of
the system event's trigger files individually.

This also allows to remove a trigger from all events in a subsystem (even
if it's not a subsystem trigger!).

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Note, this is based on top of:

  https://patchwork.kernel.org/project/linux-trace-kernel/cover/20251120205600.570673392@kernel.org/

Changes since v1: https://patch.msgid.link/20251120160003.2fa33d80@gandalf.local.home

- Removed unused set variable len (kernel test robot)

- Assign next to strim(buf) to remove beginning spaces

 Documentation/trace/events.rst      |  25 ++++
 kernel/trace/trace.c                |  11 +-
 kernel/trace/trace.h                |  15 ++-
 kernel/trace/trace_events.c         |  70 +++++-----
 kernel/trace/trace_events_trigger.c | 199 +++++++++++++++++++++++++++-
 5 files changed, 278 insertions(+), 42 deletions(-)

diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
index 18d112963dec..caa4958af43a 100644
--- a/Documentation/trace/events.rst
+++ b/Documentation/trace/events.rst
@@ -416,6 +416,31 @@ way, so beware about making generalizations between the two.
      can also enable triggers that are written into
      /sys/kernel/tracing/events/ftrace/print/trigger
 
+The system directory also has a trigger file that allows some triggers to be
+set for all the system's events. This is limited to only a small subset of the
+triggers and does not allow for the count parameter. But it does allow for
+filters. Writing into this file is the same as writing into each of the
+system's event's trigger files individually. Although only a subset of
+triggers may use this file for enabling, all triggers may use this file for
+disabling::
+
+	cd /sys/kernel/tracing
+	cat events/sched/trigger
+	# Available system triggers:
+	# stacktrace
+
+	echo stacktrace > events/sched/trigger
+	cat events/sched/sched_switch/trigger
+	stacktrace:unlimited
+
+	echo snapshot > events/sched/sched_waking/trigger
+	cat events/sched/sched_waking/trigger
+	snapshot:unlimited
+	echo '!snapshot' > events/sched/trigger
+	cat events/sched/sched_waking/trigger
+	# Available triggers:
+	# traceon traceoff snapshot stacktrace enable_event disable_event enable_hist disable_hist hist
+
 6.1 Expression syntax
 ---------------------
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 032bdedca5d9..feced9f43156 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -592,11 +592,12 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
 
 LIST_HEAD(ftrace_trace_arrays);
 
-int trace_array_get(struct trace_array *this_tr)
+int __trace_array_get(struct trace_array *this_tr)
 {
 	struct trace_array *tr;
 
-	guard(mutex)(&trace_types_lock);
+	lockdep_assert_held(&trace_types_lock);
+
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
 		if (tr == this_tr) {
 			tr->ref++;
@@ -607,6 +608,12 @@ int trace_array_get(struct trace_array *this_tr)
 	return -ENODEV;
 }
 
+int trace_array_get(struct trace_array *tr)
+{
+	guard(mutex)(&trace_types_lock);
+	return __trace_array_get(tr);
+}
+
 static void __trace_array_put(struct trace_array *this_tr)
 {
 	WARN_ON(!this_tr->ref);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fd5a6daa6c25..7379763a057d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -469,10 +469,14 @@ extern struct list_head ftrace_trace_arrays;
 extern struct mutex trace_types_lock;
 
 extern int trace_array_get(struct trace_array *tr);
+extern int __trace_array_get(struct trace_array *tr);
 extern int tracing_check_open_get_tr(struct trace_array *tr);
 extern struct trace_array *trace_array_find(const char *instance);
 extern struct trace_array *trace_array_find_get(const char *instance);
 
+extern struct trace_subsystem_dir *trace_get_system_dir(struct inode *inode);
+void trace_put_system_dir(struct trace_subsystem_dir *dir);
+
 extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe);
 extern int tracing_set_filter_buffering(struct trace_array *tr, bool set);
 extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
@@ -1774,6 +1778,7 @@ static inline struct trace_event_file *event_file_file(struct file *filp)
 }
 
 extern const struct file_operations event_trigger_fops;
+extern const struct file_operations event_system_trigger_fops;
 extern const struct file_operations event_hist_fops;
 extern const struct file_operations event_hist_debug_fops;
 extern const struct file_operations event_inject_fops;
@@ -2057,10 +2062,16 @@ struct event_command {
  *	regardless of whether or not it has a filter associated with
  *	it (filters make a trigger require access to the trace record
  *	but are not always present).
+ *
+ * @SYSTEM: A flag that says whether or not this command can be used
+ *	at the event system level. For example, can it be written into
+ *	events/sched/trigger file where it will be enabled for all
+ *	sched events?
  */
 enum event_command_flags {
-	EVENT_CMD_FL_POST_TRIGGER	= 1,
-	EVENT_CMD_FL_NEEDS_REC		= 2,
+	EVENT_CMD_FL_POST_TRIGGER	= BIT(1),
+	EVENT_CMD_FL_NEEDS_REC		= BIT(2),
+	EVENT_CMD_FL_SYSTEM		= BIT(3),
 };
 
 static inline bool event_command_post_trigger(struct event_command *cmd_ops)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9b07ad9eb284..f00b41f73fc2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2168,51 +2168,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 static LIST_HEAD(event_subsystems);
 
-static int subsystem_open(struct inode *inode, struct file *filp)
+struct trace_subsystem_dir *trace_get_system_dir(struct inode *inode)
 {
-	struct trace_subsystem_dir *dir = NULL, *iter_dir;
-	struct trace_array *tr = NULL, *iter_tr;
-	struct event_subsystem *system = NULL;
-	int ret;
+	struct trace_subsystem_dir *dir;
+	struct trace_array *tr = NULL;
 
-	if (tracing_is_disabled())
-		return -ENODEV;
+	guard(mutex)(&event_mutex);
+	guard(mutex)(&trace_types_lock);
 
 	/* Make sure the system still exists */
-	mutex_lock(&event_mutex);
-	mutex_lock(&trace_types_lock);
-	list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
-		list_for_each_entry(iter_dir, &iter_tr->systems, list) {
-			if (iter_dir == inode->i_private) {
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		list_for_each_entry(dir, &tr->systems, list) {
+			if (dir == inode->i_private) {
 				/* Don't open systems with no events */
-				tr = iter_tr;
-				dir = iter_dir;
-				if (dir->nr_events) {
-					__get_system_dir(dir);
-					system = dir->subsystem;
-				}
-				goto exit_loop;
+				if (!dir->nr_events)
+					return NULL;
+				if (__trace_array_get(tr) < 0)
+					return NULL;
+				__get_system_dir(dir);
+				return dir;
 			}
 		}
 	}
- exit_loop:
-	mutex_unlock(&trace_types_lock);
-	mutex_unlock(&event_mutex);
+	return NULL;
+}
 
-	if (!system)
+void trace_put_system_dir(struct trace_subsystem_dir *dir)
+{
+	trace_array_put(dir->tr);
+	put_system(dir);
+}
+
+static int subsystem_open(struct inode *inode, struct file *filp)
+{
+	struct trace_subsystem_dir *dir;
+	int ret;
+
+	if (tracing_is_disabled())
 		return -ENODEV;
 
-	/* Still need to increment the ref count of the system */
-	if (trace_array_get(tr) < 0) {
-		put_system(dir);
+	dir = trace_get_system_dir(inode);
+	if (!dir)
 		return -ENODEV;
-	}
 
 	ret = tracing_open_generic(inode, filp);
-	if (ret < 0) {
-		trace_array_put(tr);
-		put_system(dir);
-	}
+	if (ret < 0)
+		trace_put_system_dir(dir);
 
 	return ret;
 }
@@ -2761,6 +2762,9 @@ static int system_callback(const char *name, umode_t *mode, void **data,
 	else if (strcmp(name, "enable") == 0)
 		*fops = &ftrace_system_enable_fops;
 
+	else if (strcmp(name, "trigger") == 0)
+		*fops = &event_system_trigger_fops;
+
 	else
 		return 0;
 
@@ -2784,6 +2788,10 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 		{
 			.name		= "enable",
 			.callback	= system_callback,
+		},
+		{
+			.name		= "trigger",
+			.callback	= system_callback,
 		}
 	};
 
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 1dfe69146a81..9249770679f3 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -329,21 +329,28 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
 	return -EINVAL;
 }
 
+static char *get_user_buf(const char __user *ubuf, size_t cnt)
+{
+	if (!cnt)
+		return NULL;
+
+	if (cnt >= PAGE_SIZE)
+		return ERR_PTR(-EINVAL);
+
+	return memdup_user_nul(ubuf, cnt);
+}
+
 static ssize_t event_trigger_regex_write(struct file *file,
 					 const char __user *ubuf,
 					 size_t cnt, loff_t *ppos)
 {
 	struct trace_event_file *event_file;
 	ssize_t ret;
-	char *buf __free(kfree) = NULL;
+	char *buf __free(kfree) = get_user_buf(ubuf, cnt);
 
-	if (!cnt)
+	if (!buf)
 		return 0;
 
-	if (cnt >= PAGE_SIZE)
-		return -EINVAL;
-
-	buf = memdup_user_nul(ubuf, cnt);
 	if (IS_ERR(buf))
 		return PTR_ERR(buf);
 
@@ -397,6 +404,184 @@ const struct file_operations event_trigger_fops = {
 	.release = event_trigger_release,
 };
 
+static ssize_t
+event_system_trigger_read(struct file *filp, char __user *ubuf,
+			  size_t count, loff_t *ppos)
+{
+	char *buf __free(kfree) = kmalloc(SZ_4K, GFP_KERNEL);
+	struct event_command *p;
+	struct seq_buf s;
+	int len;
+
+	if (!buf)
+		return -ENOMEM;
+
+	seq_buf_init(&s, buf, SZ_4K);
+
+	seq_buf_puts(&s, "# Available system triggers:\n");
+	seq_buf_putc(&s, '#');
+
+	guard(mutex)(&trigger_cmd_mutex);
+	list_for_each_entry_reverse(p, &trigger_commands, list) {
+		if (p->flags & EVENT_CMD_FL_SYSTEM)
+			seq_buf_printf(&s, " %s", p->name);
+	}
+	seq_buf_putc(&s, '\n');
+
+	len = seq_buf_used(&s);
+
+	if (*ppos >= len)
+		return 0;
+
+	len -= *ppos;
+
+	if (count > len)
+		count = len;
+
+	if (copy_to_user(ubuf, buf + *ppos, count))
+		return -EFAULT;
+
+	*ppos += count;
+
+	return count;
+}
+
+static int process_system_events(struct trace_subsystem_dir *dir,
+				 struct event_command *p, char *buff,
+				 char *command, char *next)
+{
+	struct event_subsystem *system = dir->subsystem;
+	struct trace_event_file *file;
+	struct trace_array *tr = dir->tr;
+	bool remove = false;
+	int ret = 0;
+
+	if (buff[0] == '!')
+		remove = true;
+
+	lockdep_assert_held(&event_mutex);
+
+	list_for_each_entry(file, &tr->events, list) {
+
+		if (strcmp(system->name, file->event_call->class->system) != 0)
+			continue;
+
+		ret = p->parse(p, file, buff, command, next);
+
+		/* Removals and existing events do not error */
+		if (ret < 0 && ret != -EEXIST && !remove) {
+			pr_warn("Failed adding trigger %s on %s\n",
+				command, trace_event_name(file->event_call));
+		}
+	}
+	return 0;
+}
+
+static ssize_t
+event_system_trigger_write(struct file *filp, const char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	struct trace_subsystem_dir *dir = filp->private_data;
+	struct event_command *p;
+	char *command, *next;
+	char *buf __free(kfree) = get_user_buf(ubuf, cnt);
+	bool remove = false;
+	bool found = false;
+	ssize_t ret;
+
+	if (!buf)
+		return 0;
+
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	/* system triggers are not allowed to have counters */
+	if (strchr(buf, ':'))
+		return -EINVAL;
+
+	/* If opened for read too, dir is in the seq_file descriptor */
+	if (filp->f_mode & FMODE_READ) {
+		struct seq_file *m = filp->private_data;
+		dir = m->private;
+	}
+
+	/* Skip added space at beginning of buf */
+	next = strim(buf);
+
+	command = strsep(&next, " \t");
+	if (next) {
+		next = skip_spaces(next);
+		if (!*next)
+			next = NULL;
+	}
+	if (command[0] == '!') {
+		remove = true;
+		command++;
+	}
+
+	guard(mutex)(&event_mutex);
+	guard(mutex)(&trigger_cmd_mutex);
+
+	list_for_each_entry(p, &trigger_commands, list) {
+		/* Allow to remove any trigger */
+		if (!remove && !(p->flags & EVENT_CMD_FL_SYSTEM))
+			continue;
+		if (strcmp(p->name, command) == 0) {
+			found = true;
+			ret = process_system_events(dir, p, buf, command, next);
+			break;
+		}
+	}
+
+	if (!found)
+		ret = -ENODEV;
+
+	if (!ret)
+		*ppos += cnt;
+
+	if (remove || ret < 0)
+		return ret ? : cnt;
+
+	return cnt;
+}
+
+static int
+event_system_trigger_open(struct inode *inode, struct file *file)
+{
+	struct trace_subsystem_dir *dir;
+	int ret;
+
+	ret = security_locked_down(LOCKDOWN_TRACEFS);
+	if (ret)
+		return ret;
+
+	dir = trace_get_system_dir(inode);
+	if (!dir)
+		return -ENODEV;
+
+	file->private_data = dir;
+
+	return ret;
+}
+
+static int
+event_system_trigger_release(struct inode *inode, struct file *file)
+{
+	struct trace_subsystem_dir *dir = inode->i_private;
+
+	trace_put_system_dir(dir);
+
+	return 0;
+}
+
+const struct file_operations event_system_trigger_fops = {
+	.open = event_system_trigger_open,
+	.read = event_system_trigger_read,
+	.write = event_system_trigger_write,
+	.llseek = tracing_lseek,
+	.release = event_system_trigger_release,
+};
+
 /*
  * Currently we only register event commands from __init, so mark this
  * __init too.
@@ -1587,7 +1772,7 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
 static struct event_command trigger_stacktrace_cmd = {
 	.name			= "stacktrace",
 	.trigger_type		= ETT_STACKTRACE,
-	.flags			= EVENT_CMD_FL_POST_TRIGGER,
+	.flags			= EVENT_CMD_FL_POST_TRIGGER | EVENT_CMD_FL_SYSTEM,
 	.parse			= event_trigger_parse,
 	.reg			= register_trigger,
 	.unreg			= unregister_trigger,
-- 
2.51.0

Re: [PATCH v2] tracing: Add system trigger file to enable triggers for all the system's events

Posted by Masami Hiramatsu (Google) 2 months, 2 weeks ago

On Tue, 25 Nov 2025 20:08:37 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> From: Steven Rostedt <rostedt@goodmis.org>
> 
> Currently a trigger can only be added to individual events. Some triggers
> (like stacktrace) can be useful to add as a bulk trigger for a set of
> system events (like interrupt or scheduling).
> 
> Add a trigger file to the system directories:
> 
>    /sys/kernel/tracing/events/*/trigger
> 
> And allow stacktrace trigger to be enabled for all those events.
> 
> Writing into the system/trigger file acts the same as writing into each of
> the system event's trigger files individually.
> 
> This also allows to remove a trigger from all events in a subsystem (even
> if it's not a subsystem trigger!).
> 

I have some comments below.

> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> Note, this is based on top of:
> 
>   https://patchwork.kernel.org/project/linux-trace-kernel/cover/20251120205600.570673392@kernel.org/
> 
> Changes since v1: https://patch.msgid.link/20251120160003.2fa33d80@gandalf.local.home
> 
> - Removed unused set variable len (kernel test robot)
> 
> - Assign next to strim(buf) to remove beginning spaces
> 
>  Documentation/trace/events.rst      |  25 ++++
>  kernel/trace/trace.c                |  11 +-
>  kernel/trace/trace.h                |  15 ++-
>  kernel/trace/trace_events.c         |  70 +++++-----
>  kernel/trace/trace_events_trigger.c | 199 +++++++++++++++++++++++++++-
>  5 files changed, 278 insertions(+), 42 deletions(-)
> 
> diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
> index 18d112963dec..caa4958af43a 100644
> --- a/Documentation/trace/events.rst
> +++ b/Documentation/trace/events.rst
> @@ -416,6 +416,31 @@ way, so beware about making generalizations between the two.
>       can also enable triggers that are written into
>       /sys/kernel/tracing/events/ftrace/print/trigger
>  
> +The system directory also has a trigger file that allows some triggers to be
> +set for all the system's events. This is limited to only a small subset of the
> +triggers and does not allow for the count parameter. But it does allow for
> +filters. Writing into this file is the same as writing into each of the
> +system's event's trigger files individually. Although only a subset of
> +triggers may use this file for enabling, all triggers may use this file for
> +disabling::
> +
> +	cd /sys/kernel/tracing
> +	cat events/sched/trigger
> +	# Available system triggers:
> +	# stacktrace
> +
> +	echo stacktrace > events/sched/trigger
> +	cat events/sched/sched_switch/trigger
> +	stacktrace:unlimited
> +
> +	echo snapshot > events/sched/sched_waking/trigger
> +	cat events/sched/sched_waking/trigger
> +	snapshot:unlimited
> +	echo '!snapshot' > events/sched/trigger
> +	cat events/sched/sched_waking/trigger
> +	# Available triggers:
> +	# traceon traceoff snapshot stacktrace enable_event disable_event enable_hist disable_hist hist
> +
>  6.1 Expression syntax
>  ---------------------
>  
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 032bdedca5d9..feced9f43156 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -592,11 +592,12 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
>  
>  LIST_HEAD(ftrace_trace_arrays);
>  
> -int trace_array_get(struct trace_array *this_tr)
> +int __trace_array_get(struct trace_array *this_tr)
>  {
>  	struct trace_array *tr;
>  
> -	guard(mutex)(&trace_types_lock);
> +	lockdep_assert_held(&trace_types_lock);
> +
>  	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
>  		if (tr == this_tr) {
>  			tr->ref++;
> @@ -607,6 +608,12 @@ int trace_array_get(struct trace_array *this_tr)
>  	return -ENODEV;
>  }
>  
> +int trace_array_get(struct trace_array *tr)
> +{
> +	guard(mutex)(&trace_types_lock);
> +	return __trace_array_get(tr);
> +}
> +
>  static void __trace_array_put(struct trace_array *this_tr)
>  {
>  	WARN_ON(!this_tr->ref);
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index fd5a6daa6c25..7379763a057d 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -469,10 +469,14 @@ extern struct list_head ftrace_trace_arrays;
>  extern struct mutex trace_types_lock;
>  
>  extern int trace_array_get(struct trace_array *tr);
> +extern int __trace_array_get(struct trace_array *tr);
>  extern int tracing_check_open_get_tr(struct trace_array *tr);
>  extern struct trace_array *trace_array_find(const char *instance);
>  extern struct trace_array *trace_array_find_get(const char *instance);
>  
> +extern struct trace_subsystem_dir *trace_get_system_dir(struct inode *inode);
> +void trace_put_system_dir(struct trace_subsystem_dir *dir);
> +
>  extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe);
>  extern int tracing_set_filter_buffering(struct trace_array *tr, bool set);
>  extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
> @@ -1774,6 +1778,7 @@ static inline struct trace_event_file *event_file_file(struct file *filp)
>  }
>  
>  extern const struct file_operations event_trigger_fops;
> +extern const struct file_operations event_system_trigger_fops;
>  extern const struct file_operations event_hist_fops;
>  extern const struct file_operations event_hist_debug_fops;
>  extern const struct file_operations event_inject_fops;
> @@ -2057,10 +2062,16 @@ struct event_command {
>   *	regardless of whether or not it has a filter associated with
>   *	it (filters make a trigger require access to the trace record
>   *	but are not always present).
> + *
> + * @SYSTEM: A flag that says whether or not this command can be used
> + *	at the event system level. For example, can it be written into
> + *	events/sched/trigger file where it will be enabled for all
> + *	sched events?
>   */
>  enum event_command_flags {
> -	EVENT_CMD_FL_POST_TRIGGER	= 1,
> -	EVENT_CMD_FL_NEEDS_REC		= 2,
> +	EVENT_CMD_FL_POST_TRIGGER	= BIT(1),
> +	EVENT_CMD_FL_NEEDS_REC		= BIT(2),
> +	EVENT_CMD_FL_SYSTEM		= BIT(3),
>  };
>  
>  static inline bool event_command_post_trigger(struct event_command *cmd_ops)
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 9b07ad9eb284..f00b41f73fc2 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -2168,51 +2168,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
>  
>  static LIST_HEAD(event_subsystems);
>  
> -static int subsystem_open(struct inode *inode, struct file *filp)
> +struct trace_subsystem_dir *trace_get_system_dir(struct inode *inode)
>  {
> -	struct trace_subsystem_dir *dir = NULL, *iter_dir;
> -	struct trace_array *tr = NULL, *iter_tr;
> -	struct event_subsystem *system = NULL;
> -	int ret;
> +	struct trace_subsystem_dir *dir;
> +	struct trace_array *tr = NULL;

nit: This also no need to be initialized.

>  
> -	if (tracing_is_disabled())
> -		return -ENODEV;
> +	guard(mutex)(&event_mutex);
> +	guard(mutex)(&trace_types_lock);
>  
>  	/* Make sure the system still exists */
> -	mutex_lock(&event_mutex);
> -	mutex_lock(&trace_types_lock);
> -	list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
> -		list_for_each_entry(iter_dir, &iter_tr->systems, list) {
> -			if (iter_dir == inode->i_private) {
> +	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
> +		list_for_each_entry(dir, &tr->systems, list) {
> +			if (dir == inode->i_private) {
>  				/* Don't open systems with no events */
> -				tr = iter_tr;
> -				dir = iter_dir;
> -				if (dir->nr_events) {
> -					__get_system_dir(dir);
> -					system = dir->subsystem;
> -				}
> -				goto exit_loop;
> +				if (!dir->nr_events)
> +					return NULL;
> +				if (__trace_array_get(tr) < 0)
> +					return NULL;
> +				__get_system_dir(dir);
> +				return dir;
>  			}
>  		}
>  	}
> - exit_loop:
> -	mutex_unlock(&trace_types_lock);
> -	mutex_unlock(&event_mutex);
> +	return NULL;
> +}
>  
> -	if (!system)
> +void trace_put_system_dir(struct trace_subsystem_dir *dir)
> +{
> +	trace_array_put(dir->tr);
> +	put_system(dir);
> +}
> +
> +static int subsystem_open(struct inode *inode, struct file *filp)
> +{
> +	struct trace_subsystem_dir *dir;
> +	int ret;
> +
> +	if (tracing_is_disabled())
>  		return -ENODEV;
>  
> -	/* Still need to increment the ref count of the system */
> -	if (trace_array_get(tr) < 0) {
> -		put_system(dir);
> +	dir = trace_get_system_dir(inode);
> +	if (!dir)
>  		return -ENODEV;
> -	}
>  
>  	ret = tracing_open_generic(inode, filp);
> -	if (ret < 0) {
> -		trace_array_put(tr);
> -		put_system(dir);
> -	}
> +	if (ret < 0)
> +		trace_put_system_dir(dir);
>  
>  	return ret;
>  }
> @@ -2761,6 +2762,9 @@ static int system_callback(const char *name, umode_t *mode, void **data,
>  	else if (strcmp(name, "enable") == 0)
>  		*fops = &ftrace_system_enable_fops;
>  
> +	else if (strcmp(name, "trigger") == 0)
> +		*fops = &event_system_trigger_fops;
> +
>  	else
>  		return 0;
>  
> @@ -2784,6 +2788,10 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
>  		{
>  			.name		= "enable",
>  			.callback	= system_callback,
> +		},
> +		{
> +			.name		= "trigger",
> +			.callback	= system_callback,
>  		}
>  	};
>  
> diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
> index 1dfe69146a81..9249770679f3 100644
> --- a/kernel/trace/trace_events_trigger.c
> +++ b/kernel/trace/trace_events_trigger.c
> @@ -329,21 +329,28 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
>  	return -EINVAL;
>  }
>  
> +static char *get_user_buf(const char __user *ubuf, size_t cnt)
> +{
> +	if (!cnt)
> +		return NULL;
> +
> +	if (cnt >= PAGE_SIZE)
> +		return ERR_PTR(-EINVAL);
> +
> +	return memdup_user_nul(ubuf, cnt);
> +}
> +
>  static ssize_t event_trigger_regex_write(struct file *file,
>  					 const char __user *ubuf,
>  					 size_t cnt, loff_t *ppos)
>  {
>  	struct trace_event_file *event_file;
>  	ssize_t ret;
> -	char *buf __free(kfree) = NULL;
> +	char *buf __free(kfree) = get_user_buf(ubuf, cnt);
>  
> -	if (!cnt)
> +	if (!buf)
>  		return 0;
>  
> -	if (cnt >= PAGE_SIZE)
> -		return -EINVAL;
> -
> -	buf = memdup_user_nul(ubuf, cnt);
>  	if (IS_ERR(buf))
>  		return PTR_ERR(buf);

You can simply write:

	if (IS_ERR_OR_NULL(buf))
		return PTR_ERR_OR_ZERO(buf);


>  
> @@ -397,6 +404,184 @@ const struct file_operations event_trigger_fops = {
>  	.release = event_trigger_release,
>  };
>  
> +static ssize_t
> +event_system_trigger_read(struct file *filp, char __user *ubuf,
> +			  size_t count, loff_t *ppos)
> +{
> +	char *buf __free(kfree) = kmalloc(SZ_4K, GFP_KERNEL);
> +	struct event_command *p;
> +	struct seq_buf s;
> +	int len;
> +
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	seq_buf_init(&s, buf, SZ_4K);
> +
> +	seq_buf_puts(&s, "# Available system triggers:\n");
> +	seq_buf_putc(&s, '#');
> +
> +	guard(mutex)(&trigger_cmd_mutex);
> +	list_for_each_entry_reverse(p, &trigger_commands, list) {
> +		if (p->flags & EVENT_CMD_FL_SYSTEM)
> +			seq_buf_printf(&s, " %s", p->name);
> +	}
> +	seq_buf_putc(&s, '\n');
> +
> +	len = seq_buf_used(&s);
> +
> +	if (*ppos >= len)
> +		return 0;
> +
> +	len -= *ppos;
> +
> +	if (count > len)
> +		count = len;
> +
> +	if (copy_to_user(ubuf, buf + *ppos, count))
> +		return -EFAULT;
> +
> +	*ppos += count;
> +
> +	return count;
> +}
> +
> +static int process_system_events(struct trace_subsystem_dir *dir,
> +				 struct event_command *p, char *buff,
> +				 char *command, char *next)
> +{
> +	struct event_subsystem *system = dir->subsystem;
> +	struct trace_event_file *file;
> +	struct trace_array *tr = dir->tr;
> +	bool remove = false;
> +	int ret = 0;
> +
> +	if (buff[0] == '!')
> +		remove = true;
> +
> +	lockdep_assert_held(&event_mutex);
> +
> +	list_for_each_entry(file, &tr->events, list) {
> +
> +		if (strcmp(system->name, file->event_call->class->system) != 0)
> +			continue;
> +
> +		ret = p->parse(p, file, buff, command, next);
> +
> +		/* Removals and existing events do not error */
> +		if (ret < 0 && ret != -EEXIST && !remove) {
> +			pr_warn("Failed adding trigger %s on %s\n",
> +				command, trace_event_name(file->event_call));
> +		}


Can I expect that this can recover the previous settings
via event trigger?
e.g. 

# echo "stacktrace" > events/sched/sched_wakeup/trigger
# echo "stacktrace" > events/sched/trigger
# echo "!stacktrace" > events/sched/trigger
# cat events/sched/sched_wakeup/trigger
stacktrace:unlimited

?


> +	}
> +	return 0;
> +}
> +
> +static ssize_t
> +event_system_trigger_write(struct file *filp, const char __user *ubuf,
> +		    size_t cnt, loff_t *ppos)
> +{
> +	struct trace_subsystem_dir *dir = filp->private_data;
> +	struct event_command *p;
> +	char *command, *next;
> +	char *buf __free(kfree) = get_user_buf(ubuf, cnt);
> +	bool remove = false;
> +	bool found = false;
> +	ssize_t ret;
> +
> +	if (!buf)
> +		return 0;
> +
> +	if (IS_ERR(buf))
> +		return PTR_ERR(buf);

Ditto.

> +
> +	/* system triggers are not allowed to have counters */
> +	if (strchr(buf, ':'))
> +		return -EINVAL;

':' is not always used for counters (e.g. hist) and it seems odd
to check anything about parse here. Can we do this counter check
after parse a command?

> +
> +	/* If opened for read too, dir is in the seq_file descriptor */
> +	if (filp->f_mode & FMODE_READ) {
> +		struct seq_file *m = filp->private_data;
> +		dir = m->private;
> +	}
> +
> +	/* Skip added space at beginning of buf */
> +	next = strim(buf);
> +
> +	command = strsep(&next, " \t");
> +	if (next) {
> +		next = skip_spaces(next);
> +		if (!*next)
> +			next = NULL;
> +	}

strim() removes both leading and trailing whitespace. So this check is
not required.

Thanks,

> +	if (command[0] == '!') {
> +		remove = true;
> +		command++;
> +	}
> +
> +	guard(mutex)(&event_mutex);
> +	guard(mutex)(&trigger_cmd_mutex);
> +
> +	list_for_each_entry(p, &trigger_commands, list) {
> +		/* Allow to remove any trigger */
> +		if (!remove && !(p->flags & EVENT_CMD_FL_SYSTEM))
> +			continue;
> +		if (strcmp(p->name, command) == 0) {
> +			found = true;
> +			ret = process_system_events(dir, p, buf, command, next);
> +			break;
> +		}
> +	}
> +
> +	if (!found)
> +		ret = -ENODEV;
> +
> +	if (!ret)
> +		*ppos += cnt;
> +
> +	if (remove || ret < 0)
> +		return ret ? : cnt;
> +
> +	return cnt;
> +}
> +
> +static int
> +event_system_trigger_open(struct inode *inode, struct file *file)
> +{
> +	struct trace_subsystem_dir *dir;
> +	int ret;
> +
> +	ret = security_locked_down(LOCKDOWN_TRACEFS);
> +	if (ret)
> +		return ret;
> +
> +	dir = trace_get_system_dir(inode);
> +	if (!dir)
> +		return -ENODEV;
> +
> +	file->private_data = dir;
> +
> +	return ret;
> +}
> +
> +static int
> +event_system_trigger_release(struct inode *inode, struct file *file)
> +{
> +	struct trace_subsystem_dir *dir = inode->i_private;
> +
> +	trace_put_system_dir(dir);
> +
> +	return 0;
> +}
> +
> +const struct file_operations event_system_trigger_fops = {
> +	.open = event_system_trigger_open,
> +	.read = event_system_trigger_read,
> +	.write = event_system_trigger_write,
> +	.llseek = tracing_lseek,
> +	.release = event_system_trigger_release,
> +};
> +
>  /*
>   * Currently we only register event commands from __init, so mark this
>   * __init too.
> @@ -1587,7 +1772,7 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
>  static struct event_command trigger_stacktrace_cmd = {
>  	.name			= "stacktrace",
>  	.trigger_type		= ETT_STACKTRACE,
> -	.flags			= EVENT_CMD_FL_POST_TRIGGER,
> +	.flags			= EVENT_CMD_FL_POST_TRIGGER | EVENT_CMD_FL_SYSTEM,
>  	.parse			= event_trigger_parse,
>  	.reg			= register_trigger,
>  	.unreg			= unregister_trigger,
> -- 
> 2.51.0
> 
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

Re: [PATCH v2] tracing: Add system trigger file to enable triggers for all the system's events

Posted by Steven Rostedt 2 months, 2 weeks ago

On Wed, 26 Nov 2025 15:02:51 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> > This also allows to remove a trigger from all events in a subsystem (even
> > if it's not a subsystem trigger!).
> >   
> 
> I have some comments below.

BTW, it's more appropriate to simply trim the email ;-)


> > --- a/kernel/trace/trace_events.c
> > +++ b/kernel/trace/trace_events.c
> > @@ -2168,51 +2168,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
> >  
> >  static LIST_HEAD(event_subsystems);
> >  
> > -static int subsystem_open(struct inode *inode, struct file *filp)
> > +struct trace_subsystem_dir *trace_get_system_dir(struct inode *inode)
> >  {
> > -	struct trace_subsystem_dir *dir = NULL, *iter_dir;
> > -	struct trace_array *tr = NULL, *iter_tr;
> > -	struct event_subsystem *system = NULL;
> > -	int ret;
> > +	struct trace_subsystem_dir *dir;
> > +	struct trace_array *tr = NULL;  
> 
> nit: This also no need to be initialized.

Hmm, I guess this was needed in one of the versions I had before posting.

I'll fix in v3.

> 
> >  
> > -	if (tracing_is_disabled())
> > -		return -ENODEV;
> > +	guard(mutex)(&event_mutex);
> > +	guard(mutex)(&trace_types_lock);
> >  
> >  	/* Make sure the system still exists */
> > -	mutex_lock(&event_mutex);
> > -	mutex_lock(&trace_types_lock);
> > -	list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
> > -		list_for_each_entry(iter_dir, &iter_tr->systems, list) {
> > -			if (iter_dir == inode->i_private) {
> > +	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
> > +		list_for_each_entry(dir, &tr->systems, list) {
> > +			if (dir == inode->i_private) {
> >  				/* Don't open systems with no events */
> > -				tr = iter_tr;
> > -				dir = iter_dir;
> > -				if (dir->nr_events) {
> > -					__get_system_dir(dir);
> > -					system = dir->subsystem;
> > -				}
> > -				goto exit_loop;
> > +				if (!dir->nr_events)
> > +					return NULL;
> > +				if (__trace_array_get(tr) < 0)
> > +					return NULL;
> > +				__get_system_dir(dir);
> > +				return dir;
> >  			}
> >  		}
> >  	}


> >  static ssize_t event_trigger_regex_write(struct file *file,
> >  					 const char __user *ubuf,
> >  					 size_t cnt, loff_t *ppos)
> >  {
> >  	struct trace_event_file *event_file;
> >  	ssize_t ret;
> > -	char *buf __free(kfree) = NULL;
> > +	char *buf __free(kfree) = get_user_buf(ubuf, cnt);
> >  
> > -	if (!cnt)
> > +	if (!buf)
> >  		return 0;
> >  
> > -	if (cnt >= PAGE_SIZE)
> > -		return -EINVAL;
> > -
> > -	buf = memdup_user_nul(ubuf, cnt);
> >  	if (IS_ERR(buf))
> >  		return PTR_ERR(buf);  
> 
> You can simply write:
> 
> 	if (IS_ERR_OR_NULL(buf))
> 		return PTR_ERR_OR_ZERO(buf);

Yes I can. But honestly, the above is much harder to understand what is
happening than the code I had written.

I mean:

	if (!buf)
		return 0;

	if (IS_ERR(buf))
		return PTR_ERR(buf);

is pretty obvious of what is happening.

	if (IS_ERR_OR_NULL(buf))
		return PTR_ERR_OR_ZERO(buf);

Is quite a bit more obfuscated. I mean, I needed to look up what
PTR_ERR_OR_ZERO() did to be sure I knew what was returned.

> > +	list_for_each_entry(file, &tr->events, list) {
> > +
> > +		if (strcmp(system->name, file->event_call->class->system) != 0)
> > +			continue;
> > +
> > +		ret = p->parse(p, file, buff, command, next);
> > +
> > +		/* Removals and existing events do not error */
> > +		if (ret < 0 && ret != -EEXIST && !remove) {
> > +			pr_warn("Failed adding trigger %s on %s\n",
> > +				command, trace_event_name(file->event_call));
> > +		}  
> 
> 
> Can I expect that this can recover the previous settings
> via event trigger?
> e.g. 
> 
> # echo "stacktrace" > events/sched/sched_wakeup/trigger
> # echo "stacktrace" > events/sched/trigger
> # echo "!stacktrace" > events/sched/trigger
> # cat events/sched/sched_wakeup/trigger
> stacktrace:unlimited
> 
> ?

No. In fact, this is one of the features of the system trigger. Writing
into the system/trigger file is the same as writing into each of the
system's event's trigger files one at a time. In fact, I updated the
documentation in this patch to show that this file can be used to clear
tiggers too!

+       echo snapshot > events/sched/sched_waking/trigger
+       cat events/sched/sched_waking/trigger
+       snapshot:unlimited
+       echo '!snapshot' > events/sched/trigger
+       cat events/sched/sched_waking/trigger
+       # Available triggers:
+       # traceon traceoff snapshot stacktrace enable_event disable_event enable_hist disable_hist hist


> 
> 
> > +	}
> > +	return 0;
> > +}
> > +
> > +static ssize_t
> > +event_system_trigger_write(struct file *filp, const char __user *ubuf,
> > +		    size_t cnt, loff_t *ppos)
> > +{
> > +	struct trace_subsystem_dir *dir = filp->private_data;
> > +	struct event_command *p;
> > +	char *command, *next;
> > +	char *buf __free(kfree) = get_user_buf(ubuf, cnt);
> > +	bool remove = false;
> > +	bool found = false;
> > +	ssize_t ret;
> > +
> > +	if (!buf)
> > +		return 0;
> > +
> > +	if (IS_ERR(buf))
> > +		return PTR_ERR(buf);  
> 
> Ditto.

And ditto again with my reply ;-)

> 
> > +
> > +	/* system triggers are not allowed to have counters */
> > +	if (strchr(buf, ':'))
> > +		return -EINVAL;  
> 
> ':' is not always used for counters (e.g. hist) and it seems odd
> to check anything about parse here. Can we do this counter check
> after parse a command?
> 
> > +
> > +	/* If opened for read too, dir is in the seq_file descriptor */
> > +	if (filp->f_mode & FMODE_READ) {
> > +		struct seq_file *m = filp->private_data;
> > +		dir = m->private;
> > +	}
> > +
> > +	/* Skip added space at beginning of buf */
> > +	next = strim(buf);
> > +
> > +	command = strsep(&next, " \t");
> > +	if (next) {
> > +		next = skip_spaces(next);
> > +		if (!*next)
> > +			next = NULL;
> > +	}  
> 
> strim() removes both leading and trailing whitespace. So this check is
> not required.

But next here is not the one that had strim() attached to it.

	command = strsep(&next, " \t");

Updates the content of next.

Thanks for the review,

-- Steve