kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-)
When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe,
there are more data being read out than expected.
The root cause is that in tracing_splice_read_pipe(), an entry is found
outside the locks, so it may be read by multiple readers or consumed by
another reader while it is being printed.
To fix it, find the entry only after the locks have been taken.
Fixes: 7e53bd42d14c ("tracing: Consolidate protection of reader access to the ring buffer")
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
---
kernel/trace/trace.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8870078ef58..f169d33b948f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
if (ret <= 0)
goto out_err;
- if (!iter->ent && !trace_find_next_entry_inc(iter)) {
+ trace_event_read_lock();
+ trace_access_lock(iter->cpu_file);
+
+ if (!trace_find_next_entry_inc(iter)) {
+ trace_access_unlock(iter->cpu_file);
+ trace_event_read_unlock();
ret = -EFAULT;
goto out_err;
}
- trace_event_read_lock();
- trace_access_lock(iter->cpu_file);
-
/* Fill as many pages as possible. */
for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
spd.pages[i] = alloc_page(GFP_KERNEL);
--
2.25.1
On Thu, 10 Aug 2023 20:39:05 +0800 Zheng Yejian <zhengyejian1@huawei.com> wrote: > When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe, > there are more data being read out than expected. Honestly the real fix is to prevent that use case. We should probably have access to trace_pipe lock all the per_cpu trace_pipes too. -- Steve
On 2023/8/12 03:25, Steven Rostedt wrote: > On Thu, 10 Aug 2023 20:39:05 +0800 > Zheng Yejian <zhengyejian1@huawei.com> wrote: > >> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe, >> there are more data being read out than expected. > > Honestly the real fix is to prevent that use case. We should probably have > access to trace_pipe lock all the per_cpu trace_pipes too. > > -- Steve > Hi~ Reproduction testcase is show as below, it can always reproduce the issue in v5.10, and after this patch, the testcase passed. In v5.10, when run `cat trace_pipe > /tmp/myfile &`, it call sendfile() to transmit data from trace_pipe into /tmp/myfile. And in kernel, .splice_read() of trace_pipe is called then the issue is reproduced. However in the newest v6.5, this reproduction case didn't run into the .splice_read() of trace_pipe, because after commit 97ef77c52b78 ("fs: check FMODE_LSEEK to control internal pipe splicing"), non-seekable trace_pipe cannot be sendfile-ed. ``` repro.sh #!/bin/bash do_test() { local trace_dir=/sys/kernel/tracing local trace=${trace_dir}/trace local old_trace_lines local new_trace_lines local tempfiles local testlog="trace pipe concurrency issue" local pipe_pids local i local write_cnt=1000 local read_cnt=0 local nr_cpu=`nproc` # 1. At first, clear all ring buffer echo > ${trace} # 2. Count how many lines in trace file now old_trace_lines=`cat ${trace} | wc -l` # 3. Close water mark so that reader can read as event comes echo 0 > ${trace_dir}/buffer_percent # 4. Read percpu trace_pipes into local file on background. # Splice read must be used under command 'cat' so that the racy # issue can be reproduced !!! i=0 while [ ${i} -lt ${nr_cpu} ]; do tempfiles[${i}]=/tmp/percpu_trace_pipe_${i} cat ${trace_dir}/per_cpu/cpu${i}/trace_pipe > ${tempfiles[${i}]} & pipe_pids[${i}]=$! let i=i+1 done # 5. Read main trace_pipe into local file on background. # The same, splice read must be used to reproduce the issue !!! 
tempfiles[${i}]=/tmp/main_trace_pipe cat ${trace_dir}/trace_pipe > ${tempfiles[${i}]} & pipe_pids[${i}]=$! echo "Take a break, let readers run." sleep 3 # 6. Write events into ring buffer through trace_marker, so that # hungry readers start racing these events. i=0 while [ ${i} -lt ${write_cnt} ]; do echo "${testlog} <${i}>" > ${trace_dir}/trace_marker let i=i+1 done # 7. Wait until all events being consumed new_trace_lines=`cat ${trace} | wc -l` while [ "${new_trace_lines}" != "${old_trace_lines}" ]; do new_trace_lines=`cat ${trace} | wc -l` sleep 1 done echo "All written events have been consumed." # 8. Kill all readers and count the events readed i=0 while [ ${i} -lt ${#pipe_pids[*]} ]; do local num kill -9 ${pipe_pids[${i}]} wait ${pipe_pids[${i}]} num=`cat ${tempfiles[${i}]} | grep "${testlog}" | wc -l` let read_cnt=read_cnt+num let i=i+1 done # 9. Expect to read events as much as write if [ "${read_cnt}" != "${write_cnt}" ]; then echo "Test fail: write ${write_cnt} but read ${read_cnt} !!!" return 1 fi # 10. Clean temp files if test success i=0 while [ ${i} -lt ${#tempfiles[*]} ]; do rm ${tempfiles[${i}]} let i=i+1 done return 0 } do_test ``` -- Zheng Yejian
On Sat, 12 Aug 2023 15:38:12 +0800 Zheng Yejian <zhengyejian1@huawei.com> wrote: > Reproduction testcase is show as below, it can always reproduce the > issue in v5.10, and after this patch, the testcase passed. > > In v5.10, when run `cat trace_pipe > /tmp/myfile &`, it call > sendfile() to transmit data from trace_pipe into /tmp/myfile. And in > kernel, .splice_read() of trace_pipe is called then the issue is > reproduced. > > However in the newest v6.5, this reproduction case didn't run into the > .splice_read() of trace_pipe, because after commit 97ef77c52b78 ("fs: > check FMODE_LSEEK to control internal pipe splicing"), non-seekable > trace_pipe cannot be sendfile-ed. So the test case cannot be run because the "sendfile" on the trace_pipe now fails? I'm not sure if this is considered a user space regression or not, but I figured I'd let the interested parties know. I don't know what tools out there records trace_pipe via sendfile, but there might be some. -- Steve
On Sat, 12 Aug 2023 at 18:13, Steven Rostedt <rostedt@goodmis.org> wrote: > > So the test case cannot be run because the "sendfile" on the > trace_pipe now fails? So sendfile() has always required a seekable source, because it - intentionally - doesn't do the "splice to pipe and then splice from pipe to destination". It just does a "splice from source, splice result to destination". That sounds like "obviously the right thing to do", but it means that there is now no buffer with any extended lifetime between the two operations. And that's a big deal. Without that buffer (ie pipe) in the middle, if the splice to destination fails - or is partial - and the system call is interrupted by a signal, then the source splice data is now gone, gone, gone. So it only works if the source can then re-create the data - ie if the source is seekable. In that case, if the destination cannot accept all the data, the sendfile can just go back and read again from the last successfully written position. And if you are a data stream that can't seek, then that "from last successfully written position" doesn't work, and any partial writes will just have dropped the data. This seekability test *used* to be to test that the source was either a regular file or a block device. Now it literally checks "can I seek". It looks like the trace_pipe fiel is *claiming* to be a regular file - so sendfile() used to be ok with it - but cannot actually seek - so sendfile would silently drop data. Now sendfile says "I'm sorry, Dave, I'm afraid I can't do that" rather than silently causing data loss. Now, this is not a user-visible regression in this case, because "cat" will always fall back on just regular read/write when sendfile fails. So all that changed for 'trace_pipe' is that a buggy splice now no longer triggers the bug that it used to (which the patch in question was also trying to fix). End result: things in many ways work better this way. 
So it really looks like it never really worked before either. There was *both* the dropped data bug because "trace_pipe" lied about being a regular file, *and* apparently this trace_pipe race bug that Zheng Yejian tried to fix. Of course, some destinations always accept a full write, so maybe we could relax the "source must be seekable" to be "source must be seekable, _OR_ destination must be something that never returns partial writes". So sendfile to a POSIX-compliant regular file could always work (ignoring filesystem full situations, and at that point I think we can say "yeah, we're done, no recovering from that in sendfile()"). So if somebody really *really* want sendfile to work for this case, then you (a) do need to fix the race in tracing_splice_read_pipe (which you should do anyway, since you can obviously always use splice() itself, not sendfile()). AND (b) figure out when 'splice_write()' will always succeed fully and convince people that we can do that "extended version" of sendfile(). Hmm? Linus
On Sun, 13 Aug 2023 09:41:33 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote: > So if somebody really *really* want sendfile to work for this case, then you > > (a) do need to fix the race in tracing_splice_read_pipe (which you > should do anyway, since you can obviously always use splice() itself, > not sendfile()). > > AND > > (b) figure out when 'splice_write()' will always succeed fully and > convince people that we can do that "extended version" of sendfile(). > No big deal. I really don't care about splicing trace_pipe anyway. That was added by others, but the trace_pipe_raw is what really should be used. Zheng's race needs to be fixed regardless, but I just wanted to make sure this wasn't some kind of hidden regression, as there were patches to trace_pipe to make sendfile get fixed in the past. a29054d9478d0 ("tracing: Fix crash from reading trace_pipe with sendfile") But that too was triggered by cat. If cat no longer uses sendfile for trace_pipe, I'm fine with it. -- Steve
On 2023/8/12 03:25, Steven Rostedt wrote: > On Thu, 10 Aug 2023 20:39:05 +0800 > Zheng Yejian <zhengyejian1@huawei.com> wrote: > >> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe, >> there are more data being read out than expected. Sorry, I didn't make clear here. It not just read more but also lost some data. My case is that, for example: 1) Inject 3 events into ring_buffer: event1, event2, event3; 2) Concurrently splice_read through trace_pipes; 3) Then actually read out: event1, event3, event3. No event2, but 2 event3. > > Honestly the real fix is to prevent that use case. We should probably have > access to trace_pipe lock all the per_cpu trace_pipes too. Yes, we could do that, but would it seem not that effective? because per_cpu trace_pipe only read its own ring_buffer and not race with ring_buffers in other cpus. > > -- Steve >
On Sat, 12 Aug 2023 09:45:52 +0800 Zheng Yejian <zhengyejian1@huawei.com> wrote: > On 2023/8/12 03:25, Steven Rostedt wrote: > > On Thu, 10 Aug 2023 20:39:05 +0800 > > Zheng Yejian <zhengyejian1@huawei.com> wrote: > > > >> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe, > >> there are more data being read out than expected. > > Sorry, I didn't make clear here. It not just read more but also lost > some data. My case is that, for example: > 1) Inject 3 events into ring_buffer: event1, event2, event3; > 2) Concurrently splice_read through trace_pipes; > 3) Then actually read out: event1, event3, event3. No event2, but 2 > event3. > > > > > Honestly the real fix is to prevent that use case. We should probably have > > access to trace_pipe lock all the per_cpu trace_pipes too. > > Yes, we could do that, but would it seem not that effective? > because per_cpu trace_pipe only read its own ring_buffer and not race > with ring_buffers in other cpus. I think Steve said that only one of below is usable. - Read trace_pipe or - Read per_cpu/cpu*/trace_pipe concurrently And I think this makes sence, especially if you use splice (this *moves* the page from the ring_buffer to other pipe). Thank you, > > > > > -- Steve > > > -- Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Thu, 10 Aug 2023 20:39:05 +0800 Zheng Yejian <zhengyejian1@huawei.com> wrote: > When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe, > there are more data being read out than expected. > > The root cause is that in tracing_splice_read_pipe(), an entry is found > outside locks, it may be read by multiple readers or consumed by other > reader as starting printing it. > > To fix it, change to find entry after holding locks. > > Fixes: 7e53bd42d14c ("tracing: Consolidate protection of reader access to the ring buffer") > Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> > --- > kernel/trace/trace.c | 10 ++++++---- > 1 file changed, 6 insertions(+), 4 deletions(-) > > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c > index b8870078ef58..f169d33b948f 100644 > --- a/kernel/trace/trace.c > +++ b/kernel/trace/trace.c > @@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, > if (ret <= 0) > goto out_err; > > - if (!iter->ent && !trace_find_next_entry_inc(iter)) { > + trace_event_read_lock(); > + trace_access_lock(iter->cpu_file); > + > + if (!trace_find_next_entry_inc(iter)) { It seems you skips '!iter->ent' check. Is there any reason for this change? Thank you, > + trace_access_unlock(iter->cpu_file); > + trace_event_read_unlock(); > ret = -EFAULT; > goto out_err; > } > > - trace_event_read_lock(); > - trace_access_lock(iter->cpu_file); > - > /* Fill as many pages as possible. */ > for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { > spd.pages[i] = alloc_page(GFP_KERNEL); > -- > 2.25.1 > -- Masami Hiramatsu (Google) <mhiramat@kernel.org>
On 2023/8/11 19:42, Masami Hiramatsu (Google) wrote: > On Thu, 10 Aug 2023 20:39:05 +0800 > Zheng Yejian <zhengyejian1@huawei.com> wrote: > >> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe, >> there are more data being read out than expected. >> >> The root cause is that in tracing_splice_read_pipe(), an entry is found >> outside locks, it may be read by multiple readers or consumed by other >> reader as starting printing it. >> >> To fix it, change to find entry after holding locks. >> >> Fixes: 7e53bd42d14c ("tracing: Consolidate protection of reader access to the ring buffer") >> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> >> --- >> kernel/trace/trace.c | 10 ++++++---- >> 1 file changed, 6 insertions(+), 4 deletions(-) >> >> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c >> index b8870078ef58..f169d33b948f 100644 >> --- a/kernel/trace/trace.c >> +++ b/kernel/trace/trace.c >> @@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, >> if (ret <= 0) >> goto out_err; >> >> - if (!iter->ent && !trace_find_next_entry_inc(iter)) { >> + trace_event_read_lock(); >> + trace_access_lock(iter->cpu_file); >> + >> + if (!trace_find_next_entry_inc(iter)) { > > It seems you skips '!iter->ent' check. Is there any reason for this change? IIUC, 'iter->ent' may be the entry that was found but not consumed in last call tracing_splice_read_pipe(), and in this call, 'iter->ent' may have being consumed, so we may should find a new 'iter->ent' before printing it in tracing_fill_pipe_page(), see following reduced codes: tracing_splice_read_pipe() { if (!iter->ent && !trace_find_next_entry_inc(iter)) { // 1. find entry here ... ... } tracing_fill_pipe_page() { for (;;) { ... ... ret = print_trace_line(iter); // 2. print entry ... ... if (!trace_find_next_entry_inc()) { // 3. find next entry ... ... 
break; } } -- Thanks, Zheng Yejian > > Thank you, > >> + trace_access_unlock(iter->cpu_file); >> + trace_event_read_unlock(); >> ret = -EFAULT; >> goto out_err; >> } >> >> - trace_event_read_lock(); >> - trace_access_lock(iter->cpu_file); >> - >> /* Fill as many pages as possible. */ >> for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { >> spd.pages[i] = alloc_page(GFP_KERNEL); >> -- >> 2.25.1 >> > >
On Fri, 11 Aug 2023 20:37:07 +0800 Zheng Yejian <zhengyejian1@huawei.com> wrote: > On 2023/8/11 19:42, Masami Hiramatsu (Google) wrote: > > On Thu, 10 Aug 2023 20:39:05 +0800 > > Zheng Yejian <zhengyejian1@huawei.com> wrote: > > > >> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe, > >> there are more data being read out than expected. > >> > >> The root cause is that in tracing_splice_read_pipe(), an entry is found > >> outside locks, it may be read by multiple readers or consumed by other > >> reader as starting printing it. > >> > >> To fix it, change to find entry after holding locks. > >> > >> Fixes: 7e53bd42d14c ("tracing: Consolidate protection of reader access to the ring buffer") > >> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> > >> --- > >> kernel/trace/trace.c | 10 ++++++---- > >> 1 file changed, 6 insertions(+), 4 deletions(-) > >> > >> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c > >> index b8870078ef58..f169d33b948f 100644 > >> --- a/kernel/trace/trace.c > >> +++ b/kernel/trace/trace.c > >> @@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, > >> if (ret <= 0) > >> goto out_err; > >> > >> - if (!iter->ent && !trace_find_next_entry_inc(iter)) { > >> + trace_event_read_lock(); > >> + trace_access_lock(iter->cpu_file); > >> + > >> + if (!trace_find_next_entry_inc(iter)) { > > > > It seems you skips '!iter->ent' check. Is there any reason for this change? > > IIUC, 'iter->ent' may be the entry that was found but not consumed > in last call tracing_splice_read_pipe(), and in this call, 'iter->ent' > may have being consumed, so we may should find a new 'iter->ent' before > printing it in tracing_fill_pipe_page(), see following reduced codes: And if it wasn't consumed? We just lost it? > > tracing_splice_read_pipe() { > if (!iter->ent && !trace_find_next_entry_inc(iter)) { // 1. find > entry here > ... ... > } > tracing_fill_pipe_page() { > for (;;) { > ... ... 
> ret = print_trace_line(iter); // 2. print entry > ... ... You missed: count = trace_seq_used(&iter->seq) - save_len; if (rem < count) { rem = 0; iter->seq.seq.len = save_len; Where the above just threw away what was printed in the above "print_trace_line()", and it never went to console. break; } -- Steve > if (!trace_find_next_entry_inc()) { // 3. find next entry > ... ... > break; > } > } > > -- > > Thanks, > Zheng Yejian > > > > > Thank you, > > > >> + trace_access_unlock(iter->cpu_file); > >> + trace_event_read_unlock(); > >> ret = -EFAULT; > >> goto out_err; > >> } > >> > >> - trace_event_read_lock(); > >> - trace_access_lock(iter->cpu_file); > >> - > >> /* Fill as many pages as possible. */ > >> for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { > >> spd.pages[i] = alloc_page(GFP_KERNEL); > >> -- > >> 2.25.1 > >> > > > >
On 2023/8/12 03:24, Steven Rostedt wrote: > On Fri, 11 Aug 2023 20:37:07 +0800 > Zheng Yejian <zhengyejian1@huawei.com> wrote: > >> On 2023/8/11 19:42, Masami Hiramatsu (Google) wrote: >>> On Thu, 10 Aug 2023 20:39:05 +0800 >>> Zheng Yejian <zhengyejian1@huawei.com> wrote: >>> >>>> When concurrently splice_read file trace_pipe and per_cpu/cpu*/trace_pipe, >>>> there are more data being read out than expected. >>>> >>>> The root cause is that in tracing_splice_read_pipe(), an entry is found >>>> outside locks, it may be read by multiple readers or consumed by other >>>> reader as starting printing it. >>>> >>>> To fix it, change to find entry after holding locks. >>>> >>>> Fixes: 7e53bd42d14c ("tracing: Consolidate protection of reader access to the ring buffer") >>>> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> >>>> --- >>>> kernel/trace/trace.c | 10 ++++++---- >>>> 1 file changed, 6 insertions(+), 4 deletions(-) >>>> >>>> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c >>>> index b8870078ef58..f169d33b948f 100644 >>>> --- a/kernel/trace/trace.c >>>> +++ b/kernel/trace/trace.c >>>> @@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, >>>> if (ret <= 0) >>>> goto out_err; >>>> >>>> - if (!iter->ent && !trace_find_next_entry_inc(iter)) { >>>> + trace_event_read_lock(); >>>> + trace_access_lock(iter->cpu_file); >>>> + >>>> + if (!trace_find_next_entry_inc(iter)) { >>> >>> It seems you skips '!iter->ent' check. Is there any reason for this change? >> >> IIUC, 'iter->ent' may be the entry that was found but not consumed >> in last call tracing_splice_read_pipe(), and in this call, 'iter->ent' >> may have being consumed, so we may should find a new 'iter->ent' before >> printing it in tracing_fill_pipe_page(), see following reduced codes: > > And if it wasn't consumed? We just lost it? If 'iter->ent' was not consumed, trace_find_next_entry_inc() will find it again, will it? 
-- Zheng Yejian > >> >> tracing_splice_read_pipe() { >> if (!iter->ent && !trace_find_next_entry_inc(iter)) { // 1. find >> entry here >> ... ... >> } >> tracing_fill_pipe_page() { >> for (;;) { >> ... ... >> ret = print_trace_line(iter); // 2. print entry >> ... ... > > You missed: > > count = trace_seq_used(&iter->seq) - save_len; > if (rem < count) { > rem = 0; > iter->seq.seq.len = save_len; > > Where the above just threw away what was printed in the above > "print_trace_line()", and it never went to console. > > break; > } > Thanks for pointing this out! -- Zheng Yejian > -- Steve > > >> if (!trace_find_next_entry_inc()) { // 3. find next entry >> ... ... >> break; >> } >> } >> >> -- >> >> Thanks, >> Zheng Yejian >> >>> >>> Thank you, >>> >>>> + trace_access_unlock(iter->cpu_file); >>>> + trace_event_read_unlock(); >>>> ret = -EFAULT; >>>> goto out_err; >>>> } >>>> >>>> - trace_event_read_lock(); >>>> - trace_access_lock(iter->cpu_file); >>>> - >>>> /* Fill as many pages as possible. */ >>>> for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { >>>> spd.pages[i] = alloc_page(GFP_KERNEL); >>>> -- >>>> 2.25.1 >>>> >>> >>> > >
On Sat, 12 Aug 2023 10:22:43 +0800 Zheng Yejian <zhengyejian1@huawei.com> wrote: > > >>>> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c > >>>> index b8870078ef58..f169d33b948f 100644 > >>>> --- a/kernel/trace/trace.c > >>>> +++ b/kernel/trace/trace.c > >>>> @@ -7054,14 +7054,16 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, > >>>> if (ret <= 0) > >>>> goto out_err; > >>>> > >>>> - if (!iter->ent && !trace_find_next_entry_inc(iter)) { > >>>> + trace_event_read_lock(); > >>>> + trace_access_lock(iter->cpu_file); > >>>> + > >>>> + if (!trace_find_next_entry_inc(iter)) { > >>> > >>> It seems you skips '!iter->ent' check. Is there any reason for this change? > >> > >> IIUC, 'iter->ent' may be the entry that was found but not consumed > >> in last call tracing_splice_read_pipe(), and in this call, 'iter->ent' > >> may have being consumed, so we may should find a new 'iter->ent' before > >> printing it in tracing_fill_pipe_page(), see following reduced codes: > > > > And if it wasn't consumed? We just lost it? > > If 'iter->ent' was not consumed, trace_find_next_entry_inc() will find > it again, will it? > > -- Zheng Yejian > > > > >> > >> tracing_splice_read_pipe() { > >> if (!iter->ent && !trace_find_next_entry_inc(iter)) { // 1. find > >> entry here > >> ... ... > >> } > >> tracing_fill_pipe_page() { > >> for (;;) { > >> ... ... > >> ret = print_trace_line(iter); // 2. print entry > >> ... ... > > > > You missed: > > > > count = trace_seq_used(&iter->seq) - save_len; > > if (rem < count) { > > rem = 0; > > iter->seq.seq.len = save_len; > > > > Where the above just threw away what was printed in the above > > "print_trace_line()", and it never went to console. > > > > break; > > } > > > > Thanks for pointing this out! Just to get this moving again, I believe we should add a ref count to trace_pipe and the per_cpu trace_pipes, where if they are opened, nothing else can read it. 
Opening trace_pipe locks all per_cpu ref counts, if any of them are open, then the trace_pipe open will fail (and releases any ref counts it had taken). Opening a per_cpu trace_pipe will up the ref count for just that CPU buffer. This will allow multiple tasks to read different per_cpu trace_pipe files, but will prevent the main trace_pipe file from being opened. Does that work for this? -- Steve
There is a race issue when concurrently splice_read()-ing the main
trace_pipe and the per_cpu trace_pipes, which results in the data read
out being different from what was actually written.
As suggested by Steven:
> I believe we should add a ref count to trace_pipe and the per_cpu
> trace_pipes, where if they are opened, nothing else can read it.
>
> Opening trace_pipe locks all per_cpu ref counts, if any of them are
> open, then the trace_pipe open will fail (and releases any ref counts
> it had taken).
>
> Opening a per_cpu trace_pipe will up the ref count for just that
> CPU buffer. This will allow multiple tasks to read different per_cpu
> trace_pipe files, but will prevent the main trace_pipe file from
> being opened.
But because we only need to know whether per_cpu trace_pipe is open or
not, using a cpumask instead of a ref count may be easier.
After this patch, users will find that:
- Main trace_pipe can be opened by only one user, and if it is
opened, all per_cpu trace_pipes cannot be opened;
- Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
trace_pipe can only be opened by one user. And if one of them is
opened, main trace_pipe cannot be opened.
Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
---
> Does that work for this?
Hi, Steven, how about using a cpumask instead of ref count?
This patch will also prevent the main trace_pipe or any per_cpu trace_pipe
from being opened by multiple people at the same time.
kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++++++++++++------
kernel/trace/trace.h | 2 ++
2 files changed, 51 insertions(+), 7 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8870078ef58..73f6f4d10d43 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6705,24 +6705,54 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
#endif
+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ if (cpumask_empty(tr->pipe_cpumask)) {
+ cpumask_setall(tr->pipe_cpumask);
+ return 0;
+ }
+ } else {
+ if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+ cpumask_set_cpu(cpu, tr->pipe_cpumask);
+ return 0;
+ }
+ }
+ return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ WARN_ON(!cpumask_full(tr->pipe_cpumask));
+ cpumask_clear(tr->pipe_cpumask);
+ } else {
+ WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+ cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+ }
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
int ret;
+ int cpu = tracing_get_cpu(inode);
ret = tracing_check_open_get_tr(tr);
if (ret)
return ret;
mutex_lock(&trace_types_lock);
+ ret = open_pipe_on_cpu(tr, cpu);
+ if (ret)
+ goto fail_pipe_on_cpu;
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
- __trace_array_put(tr);
- goto out;
+ goto fail_alloc_iter;
}
trace_seq_init(&iter->seq);
@@ -6745,7 +6775,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->tr = tr;
iter->array_buffer = &tr->array_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
+ iter->cpu_file = cpu;
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -6755,12 +6785,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
tr->trace_ref++;
-out:
+
mutex_unlock(&trace_types_lock);
return ret;
fail:
kfree(iter);
+fail_alloc_iter:
+ close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
@@ -6777,7 +6810,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
-
+ close_pipe_on_cpu(tr, iter->cpu_file);
mutex_unlock(&trace_types_lock);
free_cpumask_var(iter->started);
@@ -9441,6 +9474,9 @@ static struct trace_array *trace_array_create(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9482,6 +9518,7 @@ static struct trace_array *trace_array_create(const char *name)
out_free_tr:
ftrace_free_ftrace_ops(tr);
free_trace_buffers(tr);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -9584,6 +9621,7 @@ static int __remove_instance(struct trace_array *tr)
}
kfree(tr->topts);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -10381,12 +10419,14 @@ __init static int tracer_alloc_buffers(void)
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
+ if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ goto out_free_savedcmd;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
- goto out_free_savedcmd;
+ goto out_free_pipe_cpumask;
}
-
if (global_trace.buffer_disabled)
tracing_off();
@@ -10439,6 +10479,8 @@ __init static int tracer_alloc_buffers(void)
return 0;
+out_free_pipe_cpumask:
+ free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e1edc2197fc8..53ac0f7780c2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,8 @@ struct trace_array {
struct list_head events;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ /* one per_cpu trace_pipe can be opened by only one user */
+ cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
--
2.25.1
On Thu, 17 Aug 2023 19:50:57 +0800 Zheng Yejian <zhengyejian1@huawei.com> wrote: > There is race issue when concurrently splice_read main trace_pipe and > per_cpu trace_pipes which will result in data read out being different > from what actually writen. > > As suggested by Steven: > > I believe we should add a ref count to trace_pipe and the per_cpu > > trace_pipes, where if they are opened, nothing else can read it. > > > > Opening trace_pipe locks all per_cpu ref counts, if any of them are > > open, then the trace_pipe open will fail (and releases any ref counts > > it had taken). > > > > Opening a per_cpu trace_pipe will up the ref count for just that > > CPU buffer. This will allow multiple tasks to read different per_cpu > > trace_pipe files, but will prevent the main trace_pipe file from > > being opened. > > But because we only need to know whether per_cpu trace_pipe is open or > not, using a cpumask instead of using ref count may be easier. > > After this patch, users will find that: > - Main trace_pipe can be opened by only one user, and if it is > opened, all per_cpu trace_pipes cannot be opened; > - Per_cpu trace_pipes can be opened by multiple users, but each per_cpu > trace_pipe can only be opened by one user. And if one of them is > opened, main trace_pipe cannot be opened. > > Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org> > Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> > --- > > > Does that work for this? > > Hi, Steven, how about using a cpumask instead of ref count? > This patch will also prevent main trace_pipe or anyone per_cpu trace_pipe > from being opened by multiple people at the same time. I'm fine with the CPU mask. 
> > kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++++++++++++------ > kernel/trace/trace.h | 2 ++ > 2 files changed, 51 insertions(+), 7 deletions(-) > > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c > index b8870078ef58..73f6f4d10d43 100644 > --- a/kernel/trace/trace.c > +++ b/kernel/trace/trace.c > @@ -6705,24 +6705,54 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, > > #endif > > +static int open_pipe_on_cpu(struct trace_array *tr, int cpu) > +{ > + if (cpu == RING_BUFFER_ALL_CPUS) { > + if (cpumask_empty(tr->pipe_cpumask)) { > + cpumask_setall(tr->pipe_cpumask); > + return 0; > + } > + } else { > + if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { > + cpumask_set_cpu(cpu, tr->pipe_cpumask); > + return 0; > + } > + } > + return -EBUSY; The above would look nicer if you had that be } else if () { if (cpu == RING_BUFFER_ALL_CPUS) { if (cpumask_empty(tr->pipe_cpumask)) { cpumask_setall(tr->pipe_cpumask); return 0; } } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { cpumask_set_cpu(cpu, tr->pipe_cpumask); return 0; } return -EBUSY; > +} > + > +static void close_pipe_on_cpu(struct trace_array *tr, int cpu) > +{ > + if (cpu == RING_BUFFER_ALL_CPUS) { > + WARN_ON(!cpumask_full(tr->pipe_cpumask)); > + cpumask_clear(tr->pipe_cpumask); > + } else { > + WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); > + cpumask_clear_cpu(cpu, tr->pipe_cpumask); > + } > +} > + > static int tracing_open_pipe(struct inode *inode, struct file *filp) > { > struct trace_array *tr = inode->i_private; > struct trace_iterator *iter; > int ret; > + int cpu = tracing_get_cpu(inode); tracing_get_cpu() must be called after tracing_check_open_get_tr(), otherwise it may not be reliable. Also, keep "int ret;" as the last declaration. 
-- Steve > > ret = tracing_check_open_get_tr(tr); > if (ret) > return ret; > > mutex_lock(&trace_types_lock); > + ret = open_pipe_on_cpu(tr, cpu); > + if (ret) > + goto fail_pipe_on_cpu; > > /* create a buffer to store the information to pass to userspace */ > iter = kzalloc(sizeof(*iter), GFP_KERNEL); > if (!iter) { > ret = -ENOMEM; > - __trace_array_put(tr); > - goto out; > + goto fail_alloc_iter; > } > > trace_seq_init(&iter->seq); > @@ -6745,7 +6775,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) > > iter->tr = tr; > iter->array_buffer = &tr->array_buffer; > - iter->cpu_file = tracing_get_cpu(inode); > + iter->cpu_file = cpu; > mutex_init(&iter->mutex); > filp->private_data = iter; > > @@ -6755,12 +6785,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) > nonseekable_open(inode, filp); > > tr->trace_ref++; > -out: > + > mutex_unlock(&trace_types_lock); > return ret; > > fail: > kfree(iter); > +fail_alloc_iter: > + close_pipe_on_cpu(tr, cpu); > +fail_pipe_on_cpu: > __trace_array_put(tr); > mutex_unlock(&trace_types_lock); > return ret; > @@ -6777,7 +6810,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) > > if (iter->trace->pipe_close) > iter->trace->pipe_close(iter); > - > + close_pipe_on_cpu(tr, iter->cpu_file); > mutex_unlock(&trace_types_lock); > > free_cpumask_var(iter->started); > @@ -9441,6 +9474,9 @@ static struct trace_array *trace_array_create(const char *name) > if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) > goto out_free_tr; > > + if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) > + goto out_free_tr; > + > tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; > > cpumask_copy(tr->tracing_cpumask, cpu_all_mask); > @@ -9482,6 +9518,7 @@ static struct trace_array *trace_array_create(const char *name) > out_free_tr: > ftrace_free_ftrace_ops(tr); > free_trace_buffers(tr); > + free_cpumask_var(tr->pipe_cpumask); > 
free_cpumask_var(tr->tracing_cpumask); > kfree(tr->name); > kfree(tr); > @@ -9584,6 +9621,7 @@ static int __remove_instance(struct trace_array *tr) > } > kfree(tr->topts); > > + free_cpumask_var(tr->pipe_cpumask); > free_cpumask_var(tr->tracing_cpumask); > kfree(tr->name); > kfree(tr); > @@ -10381,12 +10419,14 @@ __init static int tracer_alloc_buffers(void) > if (trace_create_savedcmd() < 0) > goto out_free_temp_buffer; > > + if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) > + goto out_free_savedcmd; > + > /* TODO: make the number of buffers hot pluggable with CPUS */ > if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { > MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); > - goto out_free_savedcmd; > + goto out_free_pipe_cpumask; > } > - > if (global_trace.buffer_disabled) > tracing_off(); > > @@ -10439,6 +10479,8 @@ __init static int tracer_alloc_buffers(void) > > return 0; > > +out_free_pipe_cpumask: > + free_cpumask_var(global_trace.pipe_cpumask); > out_free_savedcmd: > free_saved_cmdlines_buffer(savedcmd); > out_free_temp_buffer: > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h > index e1edc2197fc8..53ac0f7780c2 100644 > --- a/kernel/trace/trace.h > +++ b/kernel/trace/trace.h > @@ -377,6 +377,8 @@ struct trace_array { > struct list_head events; > struct trace_event_file *trace_marker_file; > cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ > + /* one per_cpu trace_pipe can be opened by only one user */ > + cpumask_var_t pipe_cpumask; > int ref; > int trace_ref; > #ifdef CONFIG_FUNCTION_TRACER
There is a race issue when concurrently splice_read-ing the main trace_pipe and
per_cpu trace_pipes, which will result in the data read out being different
from what was actually written.
As suggested by Steven:
> I believe we should add a ref count to trace_pipe and the per_cpu
> trace_pipes, where if they are opened, nothing else can read it.
>
> Opening trace_pipe locks all per_cpu ref counts, if any of them are
> open, then the trace_pipe open will fail (and releases any ref counts
> it had taken).
>
> Opening a per_cpu trace_pipe will up the ref count for just that
> CPU buffer. This will allow multiple tasks to read different per_cpu
> trace_pipe files, but will prevent the main trace_pipe file from
> being opened.
But because we only need to know whether per_cpu trace_pipe is open or
not, using a cpumask instead of using ref count may be easier.
After this patch, users will find that:
- Main trace_pipe can be opened by only one user, and if it is
opened, all per_cpu trace_pipes cannot be opened;
- Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
trace_pipe can only be opened by one user. And if one of them is
opened, main trace_pipe cannot be opened.
Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
---
v2:
- In open_pipe_on_cpu(), clean up the formatting of the if statements.
- In tracing_open_pipe(), call tracing_get_cpu() after
tracing_check_open_get_tr() and within trace_types_lock,
also keep "int ret;" as the last declaration.
Link: https://lore.kernel.org/all/20230817101331.21ab6b33@gandalf.local.home/
v1:
- Link: https://lore.kernel.org/all/20230817115057.1637676-1-zhengyejian1@huawei.com/
kernel/trace/trace.c | 55 ++++++++++++++++++++++++++++++++++++++------
kernel/trace/trace.h | 2 ++
2 files changed, 50 insertions(+), 7 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8870078ef58..c888a0a2c0e2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6705,10 +6705,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
#endif
+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ if (cpumask_empty(tr->pipe_cpumask)) {
+ cpumask_setall(tr->pipe_cpumask);
+ return 0;
+ }
+ } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+ cpumask_set_cpu(cpu, tr->pipe_cpumask);
+ return 0;
+ }
+ return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ WARN_ON(!cpumask_full(tr->pipe_cpumask));
+ cpumask_clear(tr->pipe_cpumask);
+ } else {
+ WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+ cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+ }
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
+ int cpu;
int ret;
ret = tracing_check_open_get_tr(tr);
@@ -6716,13 +6742,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
return ret;
mutex_lock(&trace_types_lock);
+ cpu = tracing_get_cpu(inode);
+ ret = open_pipe_on_cpu(tr, cpu);
+ if (ret)
+ goto fail_pipe_on_cpu;
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
- __trace_array_put(tr);
- goto out;
+ goto fail_alloc_iter;
}
trace_seq_init(&iter->seq);
@@ -6745,7 +6774,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->tr = tr;
iter->array_buffer = &tr->array_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
+ iter->cpu_file = cpu;
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -6755,12 +6784,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
tr->trace_ref++;
-out:
+
mutex_unlock(&trace_types_lock);
return ret;
fail:
kfree(iter);
+fail_alloc_iter:
+ close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
@@ -6777,7 +6809,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
-
+ close_pipe_on_cpu(tr, iter->cpu_file);
mutex_unlock(&trace_types_lock);
free_cpumask_var(iter->started);
@@ -9441,6 +9473,9 @@ static struct trace_array *trace_array_create(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9482,6 +9517,7 @@ static struct trace_array *trace_array_create(const char *name)
out_free_tr:
ftrace_free_ftrace_ops(tr);
free_trace_buffers(tr);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -9584,6 +9620,7 @@ static int __remove_instance(struct trace_array *tr)
}
kfree(tr->topts);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -10381,12 +10418,14 @@ __init static int tracer_alloc_buffers(void)
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
+ if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ goto out_free_savedcmd;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
- goto out_free_savedcmd;
+ goto out_free_pipe_cpumask;
}
-
if (global_trace.buffer_disabled)
tracing_off();
@@ -10439,6 +10478,8 @@ __init static int tracer_alloc_buffers(void)
return 0;
+out_free_pipe_cpumask:
+ free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e1edc2197fc8..53ac0f7780c2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,8 @@ struct trace_array {
struct list_head events;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ /* one per_cpu trace_pipe can be opened by only one user */
+ cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
--
2.25.1
On Fri, 18 Aug 2023 10:26:45 +0800 Zheng Yejian <zhengyejian1@huawei.com> wrote: > There is race issue when concurrently splice_read main trace_pipe and > per_cpu trace_pipes which will result in data read out being different > from what actually writen. > > As suggested by Steven: > > I believe we should add a ref count to trace_pipe and the per_cpu > > trace_pipes, where if they are opened, nothing else can read it. > > > > Opening trace_pipe locks all per_cpu ref counts, if any of them are > > open, then the trace_pipe open will fail (and releases any ref counts > > it had taken). > > > > Opening a per_cpu trace_pipe will up the ref count for just that > > CPU buffer. This will allow multiple tasks to read different per_cpu > > trace_pipe files, but will prevent the main trace_pipe file from > > being opened. > > But because we only need to know whether per_cpu trace_pipe is open or > not, using a cpumask instead of using ref count may be easier. > > After this patch, users will find that: > - Main trace_pipe can be opened by only one user, and if it is > opened, all per_cpu trace_pipes cannot be opened; > - Per_cpu trace_pipes can be opened by multiple users, but each per_cpu > trace_pipe can only be opened by one user. And if one of them is > opened, main trace_pipe cannot be opened. > > Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org> > Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> Looks good to me. Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> BTW, this reminds me that splice read support had been introduced for virtio-trace support. I've tested it with/without this patch, but I hit a different issue, which seems like a virtio-serial issue (per-cpu buffers on the guest are spliced correctly, but the data can not be read from host side). Let me investigate it. Thank you, > --- > > v2: > - In open_pipe_on_cpu(), clean format of if statements. 
> - In tracing_open_pipe(), call tracing_get_cpu() after > tracing_check_open_get_tr() and within trace_types_lock, > also keep "int ret;" as the last declaration. > Link: https://lore.kernel.org/all/20230817101331.21ab6b33@gandalf.local.home/ > > v1: > - Link: https://lore.kernel.org/all/20230817115057.1637676-1-zhengyejian1@huawei.com/ > > kernel/trace/trace.c | 55 ++++++++++++++++++++++++++++++++++++++------ > kernel/trace/trace.h | 2 ++ > 2 files changed, 50 insertions(+), 7 deletions(-) > > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c > index b8870078ef58..c888a0a2c0e2 100644 > --- a/kernel/trace/trace.c > +++ b/kernel/trace/trace.c > @@ -6705,10 +6705,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, > > #endif > > +static int open_pipe_on_cpu(struct trace_array *tr, int cpu) > +{ > + if (cpu == RING_BUFFER_ALL_CPUS) { > + if (cpumask_empty(tr->pipe_cpumask)) { > + cpumask_setall(tr->pipe_cpumask); > + return 0; > + } > + } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { > + cpumask_set_cpu(cpu, tr->pipe_cpumask); > + return 0; > + } > + return -EBUSY; > +} > + > +static void close_pipe_on_cpu(struct trace_array *tr, int cpu) > +{ > + if (cpu == RING_BUFFER_ALL_CPUS) { > + WARN_ON(!cpumask_full(tr->pipe_cpumask)); > + cpumask_clear(tr->pipe_cpumask); > + } else { > + WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); > + cpumask_clear_cpu(cpu, tr->pipe_cpumask); > + } > +} > + > static int tracing_open_pipe(struct inode *inode, struct file *filp) > { > struct trace_array *tr = inode->i_private; > struct trace_iterator *iter; > + int cpu; > int ret; > > ret = tracing_check_open_get_tr(tr); > @@ -6716,13 +6742,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) > return ret; > > mutex_lock(&trace_types_lock); > + cpu = tracing_get_cpu(inode); > + ret = open_pipe_on_cpu(tr, cpu); > + if (ret) > + goto fail_pipe_on_cpu; > > /* create a buffer to store the information to pass to userspace */ > iter = 
kzalloc(sizeof(*iter), GFP_KERNEL); > if (!iter) { > ret = -ENOMEM; > - __trace_array_put(tr); > - goto out; > + goto fail_alloc_iter; > } > > trace_seq_init(&iter->seq); > @@ -6745,7 +6774,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) > > iter->tr = tr; > iter->array_buffer = &tr->array_buffer; > - iter->cpu_file = tracing_get_cpu(inode); > + iter->cpu_file = cpu; > mutex_init(&iter->mutex); > filp->private_data = iter; > > @@ -6755,12 +6784,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) > nonseekable_open(inode, filp); > > tr->trace_ref++; > -out: > + > mutex_unlock(&trace_types_lock); > return ret; > > fail: > kfree(iter); > +fail_alloc_iter: > + close_pipe_on_cpu(tr, cpu); > +fail_pipe_on_cpu: > __trace_array_put(tr); > mutex_unlock(&trace_types_lock); > return ret; > @@ -6777,7 +6809,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) > > if (iter->trace->pipe_close) > iter->trace->pipe_close(iter); > - > + close_pipe_on_cpu(tr, iter->cpu_file); > mutex_unlock(&trace_types_lock); > > free_cpumask_var(iter->started); > @@ -9441,6 +9473,9 @@ static struct trace_array *trace_array_create(const char *name) > if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) > goto out_free_tr; > > + if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) > + goto out_free_tr; > + > tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; > > cpumask_copy(tr->tracing_cpumask, cpu_all_mask); > @@ -9482,6 +9517,7 @@ static struct trace_array *trace_array_create(const char *name) > out_free_tr: > ftrace_free_ftrace_ops(tr); > free_trace_buffers(tr); > + free_cpumask_var(tr->pipe_cpumask); > free_cpumask_var(tr->tracing_cpumask); > kfree(tr->name); > kfree(tr); > @@ -9584,6 +9620,7 @@ static int __remove_instance(struct trace_array *tr) > } > kfree(tr->topts); > > + free_cpumask_var(tr->pipe_cpumask); > free_cpumask_var(tr->tracing_cpumask); > kfree(tr->name); > kfree(tr); > @@ -10381,12 
+10418,14 @@ __init static int tracer_alloc_buffers(void) > if (trace_create_savedcmd() < 0) > goto out_free_temp_buffer; > > + if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) > + goto out_free_savedcmd; > + > /* TODO: make the number of buffers hot pluggable with CPUS */ > if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { > MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); > - goto out_free_savedcmd; > + goto out_free_pipe_cpumask; > } > - > if (global_trace.buffer_disabled) > tracing_off(); > > @@ -10439,6 +10478,8 @@ __init static int tracer_alloc_buffers(void) > > return 0; > > +out_free_pipe_cpumask: > + free_cpumask_var(global_trace.pipe_cpumask); > out_free_savedcmd: > free_saved_cmdlines_buffer(savedcmd); > out_free_temp_buffer: > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h > index e1edc2197fc8..53ac0f7780c2 100644 > --- a/kernel/trace/trace.h > +++ b/kernel/trace/trace.h > @@ -377,6 +377,8 @@ struct trace_array { > struct list_head events; > struct trace_event_file *trace_marker_file; > cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ > + /* one per_cpu trace_pipe can be opened by only one user */ > + cpumask_var_t pipe_cpumask; > int ref; > int trace_ref; > #ifdef CONFIG_FUNCTION_TRACER > -- > 2.25.1 > -- Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Fri, 18 Aug 2023 14:03:09 +0900 Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > Looks good to me. > > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > BTW, this reminds me that splice read support had been introduced for > virtio-trace support. I've tested it with/without this patch, but I hit > a different issue, which seems like a virtio-serial issue (per-cpu buffers > on the guest are spliced correctly, but the data can not be read from host > side). Let me investigate it. Is virtio-trace using trace_pipe or trace_pipe_raw? Those two are handled differently. -- Steve
On Fri, 18 Aug 2023 09:41:28 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > On Fri, 18 Aug 2023 14:03:09 +0900 > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > Looks good to me. > > > > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > > > BTW, this reminds me that splice read support had been introduced for > > virtio-trace support. I've tested it with/without this patch, but I hit > > a different issue, which seems like a virtio-serial issue (per-cpu buffers > > on the guest are spliced correctly, but the data can not be read from host > > side). Let me investigate it. > > Is virtio-trace using trace_pipe or trace_pipe_raw? Those two are handled > differently. It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial) returns -1 and errno == EAGAIN, the trace data will be lost? Thank you, > > -- Steve -- Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Fri, 18 Aug 2023 23:23:01 +0900 Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial) > returns -1 and errno == EAGAIN, the trace data will be lost? It shouldn't. If it does, then there's likely a bug. The code will block and if an interrupt comes in it will return immediately without reading from the buffer. https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262 I don't see where it would return -EINTR and consume data, but I may be missing something. -- Steve
On Fri, 18 Aug 2023 11:53:22 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > On Fri, 18 Aug 2023 23:23:01 +0900 > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial) > > returns -1 and errno == EAGAIN, the trace data will be lost? > > It shouldn't. If it does, then there's likely a bug. The code will block > and if an interrupt comes in it will return immediately without reading > from the buffer. > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262 > > I don't see where it would return -EINTR and consume data, but I may be > missing something. Hmm, I suspect the case if the spilice_to_pipe() returns -EAGAIN. https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8491 It seems not handling such case. Anyway, I also think something wrong in virtio-serial (or misusing?), since it can not read anything from the host sometimes. I just setup the virtio-trace with below patch (ignore EAGAIN). From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org> Date: Thu, 17 Aug 2023 14:08:40 +0900 Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice() splice() can return EAGAIN error instead of returning 0 size read. In that case, wait a while and try to call splice() again. 
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> --- tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c index ddfe7875eb16..e8a4c4f0c499 100644 --- a/tools/virtio/virtio-trace/trace-agent-rw.c +++ b/tools/virtio/virtio-trace/trace-agent-rw.c @@ -8,6 +8,7 @@ */ #define _GNU_SOURCE +#include <errno.h> #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -127,10 +128,10 @@ static void *rw_thread_main(void *thread_info) rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL, ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE); - if (rlen < 0) { - pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num); + if (rlen < 0 && errno != EAGAIN) { + pr_err("Splice_read error (%d) in rw-thread(%d)\n", errno, ts->cpu_num); goto error; - } else if (rlen == 0) { + } else if (rlen == 0 || errno == EAGAIN) { /* * If trace data do not exist or are unreadable not * for exceeding the page size, splice_read returns -- 2.34.1 -- Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Sat, 19 Aug 2023 10:42:57 +0900 Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > >From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001 > From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org> > Date: Thu, 17 Aug 2023 14:08:40 +0900 > Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice() > > splice() can return EAGAIN error instead of returning 0 size read. > In that case, wait a while and try to call splice() again. > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Bah, When I pulled in Zheng's patch from patchwork, it included the above signed-off-by. I was confused to why it added your SoB. Probably best if we had started this conversation on another thread and not hijack the discussion about this patch. -- Steve > --- > tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++--- > 1 file changed, 4 insertions(+), 3 deletions(-)
On Mon, 21 Aug 2023 11:17:25 -0400 Steven Rostedt <rostedt@goodmis.org> wrote: > On Sat, 19 Aug 2023 10:42:57 +0900 > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > >From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001 > > From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org> > > Date: Thu, 17 Aug 2023 14:08:40 +0900 > > Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice() > > > > splice() can return EAGAIN error instead of returning 0 size read. > > In that case, wait a while and try to call splice() again. > > > > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > Bah, > > When I pulled in Zheng's patch from patchwork, it included the above > signed-off-by. I was confused to why it added your SoB. Oops, I didn't noticed such patchwork behavior. > > Probably best if we had started this conversation on another thread and > not hijack the discussion about this patch. Agreed. Sorry for confusion. > > -- Steve > > > > --- > > tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++--- > > 1 file changed, 4 insertions(+), 3 deletions(-) > -- Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Sat, 19 Aug 2023 10:42:57 +0900 Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > On Fri, 18 Aug 2023 11:53:22 -0400 > Steven Rostedt <rostedt@goodmis.org> wrote: > > > On Fri, 18 Aug 2023 23:23:01 +0900 > > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote: > > > > > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial) > > > returns -1 and errno == EAGAIN, the trace data will be lost? > > > > It shouldn't. If it does, then there's likely a bug. The code will block > > and if an interrupt comes in it will return immediately without reading > > from the buffer. > > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262 > > > > I don't see where it would return -EINTR and consume data, but I may be > > missing something. > > Hmm, I suspect the case if the spilice_to_pipe() returns -EAGAIN. > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8491 > > It seems not handling such case. > > Anyway, I also think something wrong in virtio-serial (or misusing?), since > it can not read anything from the host sometimes. I just setup the virtio-trace > with below patch (ignore EAGAIN). Hmm, I couldn't reproduce it. (maybe a host security update change something?) Anyway, I confirmed that the ring buffer pages will not be consumed unless splice_to_pipe() succeeded. Thank you, > > > From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001 > From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org> > Date: Thu, 17 Aug 2023 14:08:40 +0900 > Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice() > > splice() can return EAGAIN error instead of returning 0 size read. > In that case, wait a while and try to call splice() again. 
> > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > --- > tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++--- > 1 file changed, 4 insertions(+), 3 deletions(-) > > diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c > index ddfe7875eb16..e8a4c4f0c499 100644 > --- a/tools/virtio/virtio-trace/trace-agent-rw.c > +++ b/tools/virtio/virtio-trace/trace-agent-rw.c > @@ -8,6 +8,7 @@ > */ > > #define _GNU_SOURCE > +#include <errno.h> > #include <fcntl.h> > #include <stdio.h> > #include <stdlib.h> > @@ -127,10 +128,10 @@ static void *rw_thread_main(void *thread_info) > rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL, > ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE); > > - if (rlen < 0) { > - pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num); > + if (rlen < 0 && errno != EAGAIN) { > + pr_err("Splice_read error (%d) in rw-thread(%d)\n", errno, ts->cpu_num); > goto error; > - } else if (rlen == 0) { > + } else if (rlen == 0 || errno == EAGAIN) { > /* > * If trace data do not exist or are unreadable not > * for exceeding the page size, splice_read returns > -- > 2.34.1 > > > -- > Masami Hiramatsu (Google) <mhiramat@kernel.org> -- Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Sat, 12 Aug 2023 10:22:43 +0800 Zheng Yejian <zhengyejian1@huawei.com> wrote: > > And if it wasn't consumed? We just lost it? > > If 'iter->ent' was not consumed, trace_find_next_entry_inc() will find > it again, will it? No it will not, because it did the "inc". That means the next time it is called, it will find the next item to read, thinking it already "consumed" the last one. -- Steve
© 2016 - 2025 Red Hat, Inc.