For standard trace instance, "trace" files allow to do non-consuming
read of the ring-buffers. This is however not applicable to ring-buffer
remotes due to the lack of support in the meta-page. Nonetheless this
file is still useful to reset a ring-buffer so let's add a write-only
version.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h
index de043a6f2fe0..1aa99da3c96a 100644
--- a/include/linux/trace_remote.h
+++ b/include/linux/trace_remote.h
@@ -11,6 +11,7 @@ struct trace_remote_callbacks {
void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv);
int (*enable_tracing)(bool enable, void *priv);
int (*swap_reader_page)(unsigned int cpu, void *priv);
+ int (*reset)(unsigned int cpu, void *priv);
};
int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv);
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index 1a4786b7970c..18b2930186cc 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -63,6 +63,7 @@ static int trace_remote_load(struct trace_remote *remote)
rb_remote->desc = remote->trace_buffer_desc;
rb_remote->swap_reader_page = remote->cbs->swap_reader_page;
rb_remote->priv = remote->priv;
+ rb_remote->reset = remote->cbs->reset;
remote->trace_buffer = ring_buffer_remote(rb_remote);
if (!remote->trace_buffer) {
remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv);
@@ -136,6 +137,21 @@ static int trace_remote_disable_tracing(struct trace_remote *remote)
return 0;
}
+static void trace_remote_reset(struct trace_remote *remote, int cpu)
+{
+ lockdep_assert_held(&remote->lock);
+
+ if (!trace_remote_loaded(remote))
+ return;
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ ring_buffer_reset(remote->trace_buffer);
+ else
+ ring_buffer_reset_cpu(remote->trace_buffer, cpu);
+
+ trace_remote_try_unload(remote);
+}
+
static ssize_t
tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
{
@@ -370,6 +386,26 @@ static const struct file_operations trace_pipe_fops = {
.release = trace_pipe_release,
};
+static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct inode *inode = file_inode(filp);
+ struct trace_remote *remote = inode->i_private;
+ int cpu = RING_BUFFER_ALL_CPUS;
+
+ if (inode->i_cdev)
+ cpu = (long)inode->i_cdev - 1;
+
+ guard(mutex)(&remote->lock);
+
+ trace_remote_reset(remote, cpu);
+
+ return cnt;
+}
+
+static const struct file_operations trace_fops = {
+ .write = trace_write,
+};
+
static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
{
struct dentry *remote_d, *percpu_d;
@@ -400,7 +436,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
!trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
&buffer_size_kb_fops) ||
!trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote,
- &trace_pipe_fops))
+ &trace_pipe_fops) ||
+ !trace_create_file("trace", 0200, remote_d, remote,
+ &trace_fops))
goto err;
percpu_d = tracefs_create_dir("per_cpu", remote_d);
@@ -422,7 +460,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
}
if (!trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu,
- &trace_pipe_fops))
+ &trace_pipe_fops) ||
+ !trace_create_cpu_file("trace", 0200, cpu_d, remote, cpu,
+ &trace_fops))
goto err;
}
--
2.51.0.rc2.233.g662b1ed5c5-goog
On Thu, 21 Aug 2025 09:13:52 +0100 Vincent Donnefort <vdonnefort@google.com> wrote: > @@ -400,7 +436,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo > !trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote, > &buffer_size_kb_fops) || > !trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, > - &trace_pipe_fops)) > + &trace_pipe_fops) || > + !trace_create_file("trace", 0200, remote_d, remote, > + &trace_fops)) > goto err; > > percpu_d = tracefs_create_dir("per_cpu", remote_d); > @@ -422,7 +460,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo > } > > if (!trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu, > - &trace_pipe_fops)) > + &trace_pipe_fops) || > + !trace_create_cpu_file("trace", 0200, cpu_d, remote, cpu, > + &trace_fops)) > goto err; > } I wonder if we should name the file "reset" to not be confusing to users when they cat the file and it doesn't produce any output. -- Steve
On Mon, Sep 08, 2025 at 07:37:57PM -0400, Steven Rostedt wrote: > On Thu, 21 Aug 2025 09:13:52 +0100 > Vincent Donnefort <vdonnefort@google.com> wrote: > > > @@ -400,7 +436,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo > > !trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote, > > &buffer_size_kb_fops) || > > !trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, > > - &trace_pipe_fops)) > > + &trace_pipe_fops) || > > + !trace_create_file("trace", 0200, remote_d, remote, > > + &trace_fops)) > > goto err; > > > > percpu_d = tracefs_create_dir("per_cpu", remote_d); > > @@ -422,7 +460,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo > > } > > > > if (!trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu, > > - &trace_pipe_fops)) > > + &trace_pipe_fops) || > > + !trace_create_cpu_file("trace", 0200, cpu_d, remote, cpu, > > + &trace_fops)) > > goto err; > > } > > I wonder if we should name the file "reset" to not be confusing to users > when they cat the file and it doesn't produce any output. My idea was to keep the exact same interface as the rest of the tracing. I could keep that /trace file for compatibility and add /reset? "cat trace" could also just returns a text like *** not supported *** ? > > -- Steve
On Tue, 9 Sep 2025 13:10:25 +0100 Vincent Donnefort <vdonnefort@google.com> wrote: > > I wonder if we should name the file "reset" to not be confusing to users > > when they cat the file and it doesn't produce any output. > > My idea was to keep the exact same interface as the rest of the tracing. I could > keep that /trace file for compatibility and add /reset? > > "cat trace" could also just returns a text like *** not supported *** ? If it's never going to be supported, I rather not add it. It not being there is a sure way of knowing it's not supported. Just adding it because the normal system has it is actually worse if it doesn't behave the same. -- Steve
On Tue, Sep 09, 2025 at 09:40:28AM -0400, Steven Rostedt wrote: > On Tue, 9 Sep 2025 13:10:25 +0100 > Vincent Donnefort <vdonnefort@google.com> wrote: > > > > I wonder if we should name the file "reset" to not be confusing to users > > > when they cat the file and it doesn't produce any output. > > > > My idea was to keep the exact same interface as the rest of the tracing. I could > > keep that /trace file for compatibility and add /reset? > > > > "cat trace" could also just returns a text like *** not supported *** ? > > If it's never going to be supported, I rather not add it. It not being > there is a sure way of knowing it's not supported. Just adding it because > the normal system has it is actually worse if it doesn't behave the same. If later we extend the meta-page to support non-consuming read, /trace would then become useful. Another argument for non-consuming read would be to enable dump on panic. But I understand your point, it might be wishful thinking at this point. > > -- Steve
On Tue, 9 Sep 2025 17:14:35 +0100 Vincent Donnefort <vdonnefort@google.com> wrote: > On Tue, Sep 09, 2025 at 09:40:28AM -0400, Steven Rostedt wrote: > > On Tue, 9 Sep 2025 13:10:25 +0100 > > Vincent Donnefort <vdonnefort@google.com> wrote: > > > > > > I wonder if we should name the file "reset" to not be confusing to users > > > > when they cat the file and it doesn't produce any output. > > > > > > My idea was to keep the exact same interface as the rest of the tracing. I could > > > keep that /trace file for compatibility and add /reset? > > > > > > "cat trace" could also just returns a text like *** not supported *** ? > > > > If it's never going to be supported, I rather not add it. It not being > > there is a sure way of knowing it's not supported. Just adding it because > > the normal system has it is actually worse if it doesn't behave the same. > > If later we extend the meta-page to support non-consuming read, /trace would > then become useful. > > Another argument for non-consuming read would be to enable dump on panic. > > But I understand your point, it might be wishful thinking at this point. > It may be possible to still do a trace file. Basically, it would work the same way the normal iterator works. Today it reads the trace while the writer could be modifying it. Actually, I think there's a bug in the iterator code today :-p It needs to be modified to copy the event, as it currently passes the event as is and that event could be written over by the writer. But anyway, I think it should work for the remote buffers too. Let me go and fix the current iterator. -- Steve
On Tue, 9 Sep 2025 14:39:48 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > But anyway, I think it should work for the remote buffers too. Let me go > and fix the current iterator. I thought it was broken but it isn't ;-) I did it properly, I just didn't look deep enough. So yeah, look at the rb_iter_head_event() code. The ring buffer iterator has a copy of the event. It has: if ((iter->head + length) > commit || length > iter->event_size) /* Writer corrupted the read? */ goto reset; memcpy(iter->event, event, length); /* * If the page stamp is still the same after this rmb() then the * event was safely copied without the writer entering the page. */ smp_rmb(); /* Make sure the page didn't change since we read this */ if (iter->page_stamp != iter_head_page->page->time_stamp || commit > rb_page_commit(iter_head_page)) goto reset; It first checks before copying that the data it's about to copy hasn't been touched by the writer. It then copies the event into the iter->event temp buffer. Then it checks again that the data hasn't been touched. If it has, then consider the data corrupt and end the iteration. This is how the "trace" file works. I believe you could do the same for the remote code. If we are gonna keep the "trace" file, let's make sure it's fully implemented. -- Steve
On Tue, Sep 09, 2025 at 02:52:36PM -0400, Steven Rostedt wrote: > On Tue, 9 Sep 2025 14:39:48 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > But anyway, I think it should work for the remote buffers too. Let me go > > and fix the current iterator. > > I thought it was broken but it isn't ;-) I did it properly, I just didn't > look deep enough. > > So yeah, look at the rb_iter_head_event() code. The ring buffer iterator > has a copy of the event. It has: > > if ((iter->head + length) > commit || length > iter->event_size) > /* Writer corrupted the read? */ > goto reset; > > memcpy(iter->event, event, length); > /* > * If the page stamp is still the same after this rmb() then the > * event was safely copied without the writer entering the page. > */ > smp_rmb(); > > /* Make sure the page didn't change since we read this */ > if (iter->page_stamp != iter_head_page->page->time_stamp || > commit > rb_page_commit(iter_head_page)) > goto reset; > > It first checks before copying that the data it's about to copy hasn't been > touched by the writer. > > It then copies the event into the iter->event temp buffer. > > Then it checks again that the data hasn't been touched. If it has, then > consider the data corrupt and end the iteration. This is how the "trace" > file works. I believe you could do the same for the remote code. > > If we are gonna keep the "trace" file, let's make sure it's fully > implemented. I was more worry about the ring-buffer page order that can be reshuffled on each swap_reader_page(), making the page links useless in the kernel. Ideally, the meta-page would keep the page ID order somewhere. Alternatively, we could walk all the buffer pages to read the timestamp and re-create the order but that sounds quite cumbersome. > > -- Steve
On Wed, 10 Sep 2025 10:45:05 +0100 Vincent Donnefort <vdonnefort@google.com> wrote: > > If we are gonna keep the "trace" file, let's make sure it's fully > > implemented. > > I was more worry about the ring-buffer page order that can be reshuffled on each > swap_reader_page(), making the page links useless in the kernel. Ideally, the > meta-page would keep the page ID order somewhere. Yeah, internally we could keep the order of the pages. Shouldn't be too hard, as the swapping happens by the kernel. > > Alternatively, we could walk all the buffer pages to read the timestamp and > re-create the order but that sounds quite cumbersome. No, having a separate array of the order is probably what we want. Then every swap_reader_page() will update the array. The trace iterator could simply walk that array. -- Steve
© 2016 - 2025 Red Hat, Inc.