[v6] Tracefs support for pKVM

[PATCH v6 04/24] tracing: Add reset to trace remotes

Posted by Vincent Donnefort 1 month, 1 week ago

For standard trace instance, "trace" files allow to do non-consuming
read of the ring-buffers. This is however not applicable to ring-buffer
remotes due to the lack of support in the meta-page. Nonetheless this
file is still useful to reset a ring-buffer so let's add a write-only
version.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h
index de043a6f2fe0..1aa99da3c96a 100644
--- a/include/linux/trace_remote.h
+++ b/include/linux/trace_remote.h
@@ -11,6 +11,7 @@ struct trace_remote_callbacks {
 	void	(*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv);
 	int	(*enable_tracing)(bool enable, void *priv);
 	int	(*swap_reader_page)(unsigned int cpu, void *priv);
+	int	(*reset)(unsigned int cpu, void *priv);
 };
 
 int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv);
diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index 1a4786b7970c..18b2930186cc 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -63,6 +63,7 @@ static int trace_remote_load(struct trace_remote *remote)
 	rb_remote->desc = remote->trace_buffer_desc;
 	rb_remote->swap_reader_page = remote->cbs->swap_reader_page;
 	rb_remote->priv = remote->priv;
+	rb_remote->reset = remote->cbs->reset;
 	remote->trace_buffer = ring_buffer_remote(rb_remote);
 	if (!remote->trace_buffer) {
 		remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv);
@@ -136,6 +137,21 @@ static int trace_remote_disable_tracing(struct trace_remote *remote)
 	return 0;
 }
 
+static void trace_remote_reset(struct trace_remote *remote, int cpu)
+{
+	lockdep_assert_held(&remote->lock);
+
+	if (!trace_remote_loaded(remote))
+		return;
+
+	if (cpu == RING_BUFFER_ALL_CPUS)
+		ring_buffer_reset(remote->trace_buffer);
+	else
+		ring_buffer_reset_cpu(remote->trace_buffer, cpu);
+
+	trace_remote_try_unload(remote);
+}
+
 static ssize_t
 tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
 {
@@ -370,6 +386,26 @@ static const struct file_operations trace_pipe_fops = {
 	.release	= trace_pipe_release,
 };
 
+static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct inode *inode = file_inode(filp);
+	struct trace_remote *remote = inode->i_private;
+	int cpu = RING_BUFFER_ALL_CPUS;
+
+	if (inode->i_cdev)
+		cpu = (long)inode->i_cdev - 1;
+
+	guard(mutex)(&remote->lock);
+
+	trace_remote_reset(remote, cpu);
+
+	return cnt;
+}
+
+static const struct file_operations trace_fops = {
+	.write		= trace_write,
+};
+
 static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
 {
 	struct dentry *remote_d, *percpu_d;
@@ -400,7 +436,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
 	    !trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
 			       &buffer_size_kb_fops) ||
 	    !trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote,
-			       &trace_pipe_fops))
+			       &trace_pipe_fops) ||
+	    !trace_create_file("trace", 0200, remote_d, remote,
+			       &trace_fops))
 		goto err;
 
 	percpu_d = tracefs_create_dir("per_cpu", remote_d);
@@ -422,7 +460,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
 		}
 
 		if (!trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu,
-					   &trace_pipe_fops))
+					   &trace_pipe_fops) ||
+		    !trace_create_cpu_file("trace", 0200, cpu_d, remote, cpu,
+					   &trace_fops))
 			goto err;
 	}
 
-- 
2.51.0.rc2.233.g662b1ed5c5-goog

Re: [PATCH v6 04/24] tracing: Add reset to trace remotes

Posted by Steven Rostedt 3 weeks, 4 days ago

On Thu, 21 Aug 2025 09:13:52 +0100
Vincent Donnefort <vdonnefort@google.com> wrote:

> @@ -400,7 +436,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
>  	    !trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
>  			       &buffer_size_kb_fops) ||
>  	    !trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote,
> -			       &trace_pipe_fops))
> +			       &trace_pipe_fops) ||
> +	    !trace_create_file("trace", 0200, remote_d, remote,
> +			       &trace_fops))
>  		goto err;
>  
>  	percpu_d = tracefs_create_dir("per_cpu", remote_d);
> @@ -422,7 +460,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
>  		}
>  
>  		if (!trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu,
> -					   &trace_pipe_fops))
> +					   &trace_pipe_fops) ||
> +		    !trace_create_cpu_file("trace", 0200, cpu_d, remote, cpu,
> +					   &trace_fops))
>  			goto err;
>  	}

I wonder if we should name the file "reset" to not be confusing to users
when they cat the file and it doesn't produce any output.

-- Steve

Re: [PATCH v6 04/24] tracing: Add reset to trace remotes

Posted by Vincent Donnefort 3 weeks, 3 days ago

On Mon, Sep 08, 2025 at 07:37:57PM -0400, Steven Rostedt wrote:
> On Thu, 21 Aug 2025 09:13:52 +0100
> Vincent Donnefort <vdonnefort@google.com> wrote:
> 
> > @@ -400,7 +436,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
> >  	    !trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote,
> >  			       &buffer_size_kb_fops) ||
> >  	    !trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote,
> > -			       &trace_pipe_fops))
> > +			       &trace_pipe_fops) ||
> > +	    !trace_create_file("trace", 0200, remote_d, remote,
> > +			       &trace_fops))
> >  		goto err;
> >  
> >  	percpu_d = tracefs_create_dir("per_cpu", remote_d);
> > @@ -422,7 +460,9 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo
> >  		}
> >  
> >  		if (!trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu,
> > -					   &trace_pipe_fops))
> > +					   &trace_pipe_fops) ||
> > +		    !trace_create_cpu_file("trace", 0200, cpu_d, remote, cpu,
> > +					   &trace_fops))
> >  			goto err;
> >  	}
> 
> I wonder if we should name the file "reset" to not be confusing to users
> when they cat the file and it doesn't produce any output.

My idea was to keep the exact same interface as the rest of the tracing. I could
keep that /trace file for compatibility and add /reset?

"cat trace" could also just returns a text like *** not supported *** ?

> 
> -- Steve

Re: [PATCH v6 04/24] tracing: Add reset to trace remotes

Posted by Steven Rostedt 3 weeks, 3 days ago

On Tue, 9 Sep 2025 13:10:25 +0100
Vincent Donnefort <vdonnefort@google.com> wrote:

> > I wonder if we should name the file "reset" to not be confusing to users
> > when they cat the file and it doesn't produce any output.  
> 
> My idea was to keep the exact same interface as the rest of the tracing. I could
> keep that /trace file for compatibility and add /reset?
> 
> "cat trace" could also just returns a text like *** not supported *** ?

If it's never going to be supported, I rather not add it. It not being
there is a sure way of knowing it's not supported. Just adding it because
the normal system has it is actually worse if it doesn't behave the same.

-- Steve

Re: [PATCH v6 04/24] tracing: Add reset to trace remotes

Posted by Vincent Donnefort 3 weeks, 3 days ago

On Tue, Sep 09, 2025 at 09:40:28AM -0400, Steven Rostedt wrote:
> On Tue, 9 Sep 2025 13:10:25 +0100
> Vincent Donnefort <vdonnefort@google.com> wrote:
> 
> > > I wonder if we should name the file "reset" to not be confusing to users
> > > when they cat the file and it doesn't produce any output.  
> > 
> > My idea was to keep the exact same interface as the rest of the tracing. I could
> > keep that /trace file for compatibility and add /reset?
> > 
> > "cat trace" could also just returns a text like *** not supported *** ?
> 
> If it's never going to be supported, I rather not add it. It not being
> there is a sure way of knowing it's not supported. Just adding it because
> the normal system has it is actually worse if it doesn't behave the same.

If later we extend the meta-page to support non-consuming read, /trace would
then become useful.

Another argument for non-consuming read would be to enable dump on panic.

But I understand your point, it might be wishful thinking at this point.

> 
> -- Steve

Re: [PATCH v6 04/24] tracing: Add reset to trace remotes

Posted by Steven Rostedt 3 weeks, 3 days ago

On Tue, 9 Sep 2025 17:14:35 +0100
Vincent Donnefort <vdonnefort@google.com> wrote:

> On Tue, Sep 09, 2025 at 09:40:28AM -0400, Steven Rostedt wrote:
> > On Tue, 9 Sep 2025 13:10:25 +0100
> > Vincent Donnefort <vdonnefort@google.com> wrote:
> >   
> > > > I wonder if we should name the file "reset" to not be confusing to users
> > > > when they cat the file and it doesn't produce any output.    
> > > 
> > > My idea was to keep the exact same interface as the rest of the tracing. I could
> > > keep that /trace file for compatibility and add /reset?
> > > 
> > > "cat trace" could also just returns a text like *** not supported *** ?  
> > 
> > If it's never going to be supported, I rather not add it. It not being
> > there is a sure way of knowing it's not supported. Just adding it because
> > the normal system has it is actually worse if it doesn't behave the same.  
> 
> If later we extend the meta-page to support non-consuming read, /trace would
> then become useful.
> 
> Another argument for non-consuming read would be to enable dump on panic.
> 
> But I understand your point, it might be wishful thinking at this point.
>

It may be possible to still do a trace file. Basically, it would work the
same way the normal iterator works. Today it reads the trace while the
writer could be modifying it.

Actually, I think there's a bug in the iterator code today :-p

It needs to be modified to copy the event, as it currently passes the event
as is and that event could be written over by the writer.

But anyway, I think it should work for the remote buffers too. Let me go
and fix the current iterator.

-- Steve

Re: [PATCH v6 04/24] tracing: Add reset to trace remotes

Posted by Steven Rostedt 3 weeks, 3 days ago

On Tue, 9 Sep 2025 14:39:48 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> But anyway, I think it should work for the remote buffers too. Let me go
> and fix the current iterator.

I thought it was broken but it isn't ;-) I did it properly, I just didn't
look deep enough.

So yeah, look at the rb_iter_head_event() code. The ring buffer iterator
has a copy of the event. It has:

	if ((iter->head + length) > commit || length > iter->event_size)
		/* Writer corrupted the read? */
		goto reset;

	memcpy(iter->event, event, length);
	/*
	 * If the page stamp is still the same after this rmb() then the
	 * event was safely copied without the writer entering the page.
	 */
	smp_rmb();

	/* Make sure the page didn't change since we read this */
	if (iter->page_stamp != iter_head_page->page->time_stamp ||
	    commit > rb_page_commit(iter_head_page))
		goto reset;

It first checks before copying that the data it's about to copy hasn't been
touched by the writer.

It then copies the event into the iter->event temp buffer.

Then it checks again that the data hasn't been touched. If it has, then
consider the data corrupt and end the iteration. This is how the "trace"
file works. I believe you could do the same for the remote code.

If we are gonna keep the "trace" file, let's make sure it's fully
implemented.

-- Steve

Re: [PATCH v6 04/24] tracing: Add reset to trace remotes

Posted by Vincent Donnefort 3 weeks, 2 days ago

On Tue, Sep 09, 2025 at 02:52:36PM -0400, Steven Rostedt wrote:
> On Tue, 9 Sep 2025 14:39:48 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > But anyway, I think it should work for the remote buffers too. Let me go
> > and fix the current iterator.
> 
> I thought it was broken but it isn't ;-) I did it properly, I just didn't
> look deep enough.
> 
> So yeah, look at the rb_iter_head_event() code. The ring buffer iterator
> has a copy of the event. It has:
> 
> 	if ((iter->head + length) > commit || length > iter->event_size)
> 		/* Writer corrupted the read? */
> 		goto reset;
> 
> 	memcpy(iter->event, event, length);
> 	/*
> 	 * If the page stamp is still the same after this rmb() then the
> 	 * event was safely copied without the writer entering the page.
> 	 */
> 	smp_rmb();
> 
> 	/* Make sure the page didn't change since we read this */
> 	if (iter->page_stamp != iter_head_page->page->time_stamp ||
> 	    commit > rb_page_commit(iter_head_page))
> 		goto reset;
> 
> It first checks before copying that the data it's about to copy hasn't been
> touched by the writer.
> 
> It then copies the event into the iter->event temp buffer.
> 
> Then it checks again that the data hasn't been touched. If it has, then
> consider the data corrupt and end the iteration. This is how the "trace"
> file works. I believe you could do the same for the remote code.
> 
> If we are gonna keep the "trace" file, let's make sure it's fully
> implemented.

I was more worry about the ring-buffer page order that can be reshuffled on each
swap_reader_page(), making the page links useless in the kernel. Ideally, the
meta-page would keep the page ID order somewhere.

Alternatively, we could walk all the buffer pages to read the timestamp and
re-create the order but that sounds quite cumbersome.

> 
> -- Steve

Re: [PATCH v6 04/24] tracing: Add reset to trace remotes

Posted by Steven Rostedt 3 weeks, 2 days ago

On Wed, 10 Sep 2025 10:45:05 +0100
Vincent Donnefort <vdonnefort@google.com> wrote:

> > If we are gonna keep the "trace" file, let's make sure it's fully
> > implemented.  
> 
> I was more worry about the ring-buffer page order that can be reshuffled on each
> swap_reader_page(), making the page links useless in the kernel. Ideally, the
> meta-page would keep the page ID order somewhere.

Yeah, internally we could keep the order of the pages. Shouldn't be too
hard, as the swapping happens by the kernel.

> 
> Alternatively, we could walk all the buffer pages to read the timestamp and
> re-create the order but that sounds quite cumbersome.

No, having a separate array of the order is probably what we want. Then
every swap_reader_page() will update the array. The trace iterator could
simply walk that array.

-- Steve