There is a race issue when the main trace_pipe and per_cpu trace_pipes are
splice_read concurrently, which will result in the data read out being
different from what was actually written.
As suggested by Steven:
> I believe we should add a ref count to trace_pipe and the per_cpu
> trace_pipes, where if they are opened, nothing else can read it.
>
> Opening trace_pipe locks all per_cpu ref counts, if any of them are
> open, then the trace_pipe open will fail (and releases any ref counts
> it had taken).
>
> Opening a per_cpu trace_pipe will up the ref count for just that
> CPU buffer. This will allow multiple tasks to read different per_cpu
> trace_pipe files, but will prevent the main trace_pipe file from
> being opened.
But because we only need to know whether a per_cpu trace_pipe is open or
not, using a cpumask instead of a ref count may be easier.
After this patch, users will find that:
- The main trace_pipe can be opened by only one user, and while it is
  open, none of the per_cpu trace_pipes can be opened;
- Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
  trace_pipe can only be opened by one user. And if any of them is
  open, the main trace_pipe cannot be opened.
Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
---
> Does that work for this?
Hi, Steven, how about using a cpumask instead of a ref count?
This patch will also prevent the main trace_pipe or any per_cpu trace_pipe
from being opened by multiple users at the same time.
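Not part of the patch itself, but for reference, the expected behavior can be
checked from userspace with a minimal sketch like the one below. It assumes
the default /sys/kernel/tracing tracefs mount point; adjust the paths for
other setups.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int cpu0_fd, main_fd;

	/* Hold one per_cpu trace_pipe open. */
	cpu0_fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe", O_RDONLY);
	if (cpu0_fd < 0) {
		perror("open per_cpu/cpu0/trace_pipe");
		return 1;
	}

	/* After this patch, opening the main trace_pipe is expected to fail with EBUSY. */
	main_fd = open("/sys/kernel/tracing/trace_pipe", O_RDONLY);
	if (main_fd < 0)
		printf("trace_pipe: %s (EBUSY expected)\n", strerror(errno));
	else
		close(main_fd);

	close(cpu0_fd);
	return 0;
}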
kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++++++++++++------
kernel/trace/trace.h | 2 ++
2 files changed, 51 insertions(+), 7 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8870078ef58..73f6f4d10d43 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6705,24 +6705,54 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
#endif
+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ if (cpumask_empty(tr->pipe_cpumask)) {
+ cpumask_setall(tr->pipe_cpumask);
+ return 0;
+ }
+ } else {
+ if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+ cpumask_set_cpu(cpu, tr->pipe_cpumask);
+ return 0;
+ }
+ }
+ return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ WARN_ON(!cpumask_full(tr->pipe_cpumask));
+ cpumask_clear(tr->pipe_cpumask);
+ } else {
+ WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+ cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+ }
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
int ret;
+ int cpu = tracing_get_cpu(inode);
ret = tracing_check_open_get_tr(tr);
if (ret)
return ret;
mutex_lock(&trace_types_lock);
+ ret = open_pipe_on_cpu(tr, cpu);
+ if (ret)
+ goto fail_pipe_on_cpu;
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
- __trace_array_put(tr);
- goto out;
+ goto fail_alloc_iter;
}
trace_seq_init(&iter->seq);
@@ -6745,7 +6775,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->tr = tr;
iter->array_buffer = &tr->array_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
+ iter->cpu_file = cpu;
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -6755,12 +6785,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
tr->trace_ref++;
-out:
+
mutex_unlock(&trace_types_lock);
return ret;
fail:
kfree(iter);
+fail_alloc_iter:
+ close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
@@ -6777,7 +6810,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
-
+ close_pipe_on_cpu(tr, iter->cpu_file);
mutex_unlock(&trace_types_lock);
free_cpumask_var(iter->started);
@@ -9441,6 +9474,9 @@ static struct trace_array *trace_array_create(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9482,6 +9518,7 @@ static struct trace_array *trace_array_create(const char *name)
out_free_tr:
ftrace_free_ftrace_ops(tr);
free_trace_buffers(tr);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -9584,6 +9621,7 @@ static int __remove_instance(struct trace_array *tr)
}
kfree(tr->topts);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -10381,12 +10419,14 @@ __init static int tracer_alloc_buffers(void)
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
+ if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ goto out_free_savedcmd;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
- goto out_free_savedcmd;
+ goto out_free_pipe_cpumask;
}
-
if (global_trace.buffer_disabled)
tracing_off();
@@ -10439,6 +10479,8 @@ __init static int tracer_alloc_buffers(void)
return 0;
+out_free_pipe_cpumask:
+ free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e1edc2197fc8..53ac0f7780c2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,8 @@ struct trace_array {
struct list_head events;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ /* one per_cpu trace_pipe can be opened by only one user */
+ cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
--
2.25.1
On Thu, 17 Aug 2023 19:50:57 +0800
Zheng Yejian <zhengyejian1@huawei.com> wrote:
> There is a race issue when the main trace_pipe and per_cpu trace_pipes are
> splice_read concurrently, which will result in the data read out being
> different from what was actually written.
>
> As suggested by Steven:
> > I believe we should add a ref count to trace_pipe and the per_cpu
> > trace_pipes, where if they are opened, nothing else can read it.
> >
> > Opening trace_pipe locks all per_cpu ref counts, if any of them are
> > open, then the trace_pipe open will fail (and releases any ref counts
> > it had taken).
> >
> > Opening a per_cpu trace_pipe will up the ref count for just that
> > CPU buffer. This will allow multiple tasks to read different per_cpu
> > trace_pipe files, but will prevent the main trace_pipe file from
> > being opened.
>
> But because we only need to know whether a per_cpu trace_pipe is open or
> not, using a cpumask instead of a ref count may be easier.
>
> After this patch, users will find that:
> - The main trace_pipe can be opened by only one user, and while it is
> open, none of the per_cpu trace_pipes can be opened;
> - Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
> trace_pipe can only be opened by one user. And if any of them is
> open, the main trace_pipe cannot be opened.
>
> Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
> ---
>
> > Does that work for this?
>
> Hi, Steven, how about using a cpumask instead of a ref count?
> This patch will also prevent the main trace_pipe or any per_cpu trace_pipe
> from being opened by multiple users at the same time.
I'm fine with the CPU mask.
>
> kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++++++++++++------
> kernel/trace/trace.h | 2 ++
> 2 files changed, 51 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index b8870078ef58..73f6f4d10d43 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -6705,24 +6705,54 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
>
> #endif
>
> +static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
> +{
> + if (cpu == RING_BUFFER_ALL_CPUS) {
> + if (cpumask_empty(tr->pipe_cpumask)) {
> + cpumask_setall(tr->pipe_cpumask);
> + return 0;
> + }
> + } else {
> + if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
> + cpumask_set_cpu(cpu, tr->pipe_cpumask);
> + return 0;
> + }
> + }
> + return -EBUSY;
The above would look nicer if you had that be } else if () {
if (cpu == RING_BUFFER_ALL_CPUS) {
if (cpumask_empty(tr->pipe_cpumask)) {
cpumask_setall(tr->pipe_cpumask);
return 0;
}
} else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
cpumask_set_cpu(cpu, tr->pipe_cpumask);
return 0;
}
return -EBUSY;
> +}
> +
> +static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
> +{
> + if (cpu == RING_BUFFER_ALL_CPUS) {
> + WARN_ON(!cpumask_full(tr->pipe_cpumask));
> + cpumask_clear(tr->pipe_cpumask);
> + } else {
> + WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
> + cpumask_clear_cpu(cpu, tr->pipe_cpumask);
> + }
> +}
> +
> static int tracing_open_pipe(struct inode *inode, struct file *filp)
> {
> struct trace_array *tr = inode->i_private;
> struct trace_iterator *iter;
> int ret;
> + int cpu = tracing_get_cpu(inode);
tracing_get_cpu() must be called after tracing_check_open_get_tr(),
otherwise it may not be reliable.
Also, keep "int ret;" as the last declaration.
-- Steve
>
> ret = tracing_check_open_get_tr(tr);
> if (ret)
> return ret;
>
> mutex_lock(&trace_types_lock);
> + ret = open_pipe_on_cpu(tr, cpu);
> + if (ret)
> + goto fail_pipe_on_cpu;
>
> /* create a buffer to store the information to pass to userspace */
> iter = kzalloc(sizeof(*iter), GFP_KERNEL);
> if (!iter) {
> ret = -ENOMEM;
> - __trace_array_put(tr);
> - goto out;
> + goto fail_alloc_iter;
> }
>
> trace_seq_init(&iter->seq);
> @@ -6745,7 +6775,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
>
> iter->tr = tr;
> iter->array_buffer = &tr->array_buffer;
> - iter->cpu_file = tracing_get_cpu(inode);
> + iter->cpu_file = cpu;
> mutex_init(&iter->mutex);
> filp->private_data = iter;
>
> @@ -6755,12 +6785,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
> nonseekable_open(inode, filp);
>
> tr->trace_ref++;
> -out:
> +
> mutex_unlock(&trace_types_lock);
> return ret;
>
> fail:
> kfree(iter);
> +fail_alloc_iter:
> + close_pipe_on_cpu(tr, cpu);
> +fail_pipe_on_cpu:
> __trace_array_put(tr);
> mutex_unlock(&trace_types_lock);
> return ret;
> @@ -6777,7 +6810,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
>
> if (iter->trace->pipe_close)
> iter->trace->pipe_close(iter);
> -
> + close_pipe_on_cpu(tr, iter->cpu_file);
> mutex_unlock(&trace_types_lock);
>
> free_cpumask_var(iter->started);
> @@ -9441,6 +9474,9 @@ static struct trace_array *trace_array_create(const char *name)
> if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
> goto out_free_tr;
>
> + if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
> + goto out_free_tr;
> +
> tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
>
> cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
> @@ -9482,6 +9518,7 @@ static struct trace_array *trace_array_create(const char *name)
> out_free_tr:
> ftrace_free_ftrace_ops(tr);
> free_trace_buffers(tr);
> + free_cpumask_var(tr->pipe_cpumask);
> free_cpumask_var(tr->tracing_cpumask);
> kfree(tr->name);
> kfree(tr);
> @@ -9584,6 +9621,7 @@ static int __remove_instance(struct trace_array *tr)
> }
> kfree(tr->topts);
>
> + free_cpumask_var(tr->pipe_cpumask);
> free_cpumask_var(tr->tracing_cpumask);
> kfree(tr->name);
> kfree(tr);
> @@ -10381,12 +10419,14 @@ __init static int tracer_alloc_buffers(void)
> if (trace_create_savedcmd() < 0)
> goto out_free_temp_buffer;
>
> + if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
> + goto out_free_savedcmd;
> +
> /* TODO: make the number of buffers hot pluggable with CPUS */
> if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
> MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
> - goto out_free_savedcmd;
> + goto out_free_pipe_cpumask;
> }
> -
> if (global_trace.buffer_disabled)
> tracing_off();
>
> @@ -10439,6 +10479,8 @@ __init static int tracer_alloc_buffers(void)
>
> return 0;
>
> +out_free_pipe_cpumask:
> + free_cpumask_var(global_trace.pipe_cpumask);
> out_free_savedcmd:
> free_saved_cmdlines_buffer(savedcmd);
> out_free_temp_buffer:
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index e1edc2197fc8..53ac0f7780c2 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -377,6 +377,8 @@ struct trace_array {
> struct list_head events;
> struct trace_event_file *trace_marker_file;
> cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
> + /* one per_cpu trace_pipe can be opened by only one user */
> + cpumask_var_t pipe_cpumask;
> int ref;
> int trace_ref;
> #ifdef CONFIG_FUNCTION_TRACER
There is a race issue when the main trace_pipe and per_cpu trace_pipes are
splice_read concurrently, which will result in the data read out being
different from what was actually written.
As suggested by Steven:
> I believe we should add a ref count to trace_pipe and the per_cpu
> trace_pipes, where if they are opened, nothing else can read it.
>
> Opening trace_pipe locks all per_cpu ref counts, if any of them are
> open, then the trace_pipe open will fail (and releases any ref counts
> it had taken).
>
> Opening a per_cpu trace_pipe will up the ref count for just that
> CPU buffer. This will allow multiple tasks to read different per_cpu
> trace_pipe files, but will prevent the main trace_pipe file from
> being opened.
But because we only need to know whether a per_cpu trace_pipe is open or
not, using a cpumask instead of a ref count may be easier.
After this patch, users will find that:
- The main trace_pipe can be opened by only one user, and while it is
  open, none of the per_cpu trace_pipes can be opened;
- Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
  trace_pipe can only be opened by one user. And if any of them is
  open, the main trace_pipe cannot be opened.
Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
---
v2:
- In open_pipe_on_cpu(), clean format of if statements.
- In tracing_open_pipe(), call tracing_get_cpu() after
tracing_check_open_get_tr() and within trace_types_lock,
also keep "int ret;" as the last declaration.
Link: https://lore.kernel.org/all/20230817101331.21ab6b33@gandalf.local.home/
v1:
- Link: https://lore.kernel.org/all/20230817115057.1637676-1-zhengyejian1@huawei.com/
kernel/trace/trace.c | 55 ++++++++++++++++++++++++++++++++++++++------
kernel/trace/trace.h | 2 ++
2 files changed, 50 insertions(+), 7 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8870078ef58..c888a0a2c0e2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6705,10 +6705,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
#endif
+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ if (cpumask_empty(tr->pipe_cpumask)) {
+ cpumask_setall(tr->pipe_cpumask);
+ return 0;
+ }
+ } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+ cpumask_set_cpu(cpu, tr->pipe_cpumask);
+ return 0;
+ }
+ return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ WARN_ON(!cpumask_full(tr->pipe_cpumask));
+ cpumask_clear(tr->pipe_cpumask);
+ } else {
+ WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+ cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+ }
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
+ int cpu;
int ret;
ret = tracing_check_open_get_tr(tr);
@@ -6716,13 +6742,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
return ret;
mutex_lock(&trace_types_lock);
+ cpu = tracing_get_cpu(inode);
+ ret = open_pipe_on_cpu(tr, cpu);
+ if (ret)
+ goto fail_pipe_on_cpu;
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
- __trace_array_put(tr);
- goto out;
+ goto fail_alloc_iter;
}
trace_seq_init(&iter->seq);
@@ -6745,7 +6774,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->tr = tr;
iter->array_buffer = &tr->array_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
+ iter->cpu_file = cpu;
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -6755,12 +6784,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
tr->trace_ref++;
-out:
+
mutex_unlock(&trace_types_lock);
return ret;
fail:
kfree(iter);
+fail_alloc_iter:
+ close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
@@ -6777,7 +6809,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
-
+ close_pipe_on_cpu(tr, iter->cpu_file);
mutex_unlock(&trace_types_lock);
free_cpumask_var(iter->started);
@@ -9441,6 +9473,9 @@ static struct trace_array *trace_array_create(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9482,6 +9517,7 @@ static struct trace_array *trace_array_create(const char *name)
out_free_tr:
ftrace_free_ftrace_ops(tr);
free_trace_buffers(tr);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -9584,6 +9620,7 @@ static int __remove_instance(struct trace_array *tr)
}
kfree(tr->topts);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -10381,12 +10418,14 @@ __init static int tracer_alloc_buffers(void)
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
+ if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ goto out_free_savedcmd;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
- goto out_free_savedcmd;
+ goto out_free_pipe_cpumask;
}
-
if (global_trace.buffer_disabled)
tracing_off();
@@ -10439,6 +10478,8 @@ __init static int tracer_alloc_buffers(void)
return 0;
+out_free_pipe_cpumask:
+ free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e1edc2197fc8..53ac0f7780c2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,8 @@ struct trace_array {
struct list_head events;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ /* one per_cpu trace_pipe can be opened by only one user */
+ cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
--
2.25.1
On Fri, 18 Aug 2023 10:26:45 +0800
Zheng Yejian <zhengyejian1@huawei.com> wrote:
> There is a race issue when the main trace_pipe and per_cpu trace_pipes are
> splice_read concurrently, which will result in the data read out being
> different from what was actually written.
>
> As suggested by Steven:
> > I believe we should add a ref count to trace_pipe and the per_cpu
> > trace_pipes, where if they are opened, nothing else can read it.
> >
> > Opening trace_pipe locks all per_cpu ref counts, if any of them are
> > open, then the trace_pipe open will fail (and releases any ref counts
> > it had taken).
> >
> > Opening a per_cpu trace_pipe will up the ref count for just that
> > CPU buffer. This will allow multiple tasks to read different per_cpu
> > trace_pipe files, but will prevent the main trace_pipe file from
> > being opened.
>
> But because we only need to know whether a per_cpu trace_pipe is open or
> not, using a cpumask instead of a ref count may be easier.
>
> After this patch, users will find that:
> - The main trace_pipe can be opened by only one user, and while it is
> open, none of the per_cpu trace_pipes can be opened;
> - Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
> trace_pipe can only be opened by one user. And if any of them is
> open, the main trace_pipe cannot be opened.
>
> Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Looks good to me.
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
BTW, this reminds me that splice read support had been introduced for
virtio-trace support. I've tested it with/without this patch, but I hit
a different issue, which seems like a virtio-serial issue (per-cpu buffers
on the guest are spliced correctly, but the data can not be read from host
side). Let me investigate it.
Thank you,
> ---
>
> v2:
> - In open_pipe_on_cpu(), clean format of if statements.
> - In tracing_open_pipe(), call tracing_get_cpu() after
> tracing_check_open_get_tr() and within trace_types_lock,
> also keep "int ret;" as the last declaration.
> Link: https://lore.kernel.org/all/20230817101331.21ab6b33@gandalf.local.home/
>
> v1:
> - Link: https://lore.kernel.org/all/20230817115057.1637676-1-zhengyejian1@huawei.com/
>
> kernel/trace/trace.c | 55 ++++++++++++++++++++++++++++++++++++++------
> kernel/trace/trace.h | 2 ++
> 2 files changed, 50 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index b8870078ef58..c888a0a2c0e2 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -6705,10 +6705,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
>
> #endif
>
> +static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
> +{
> + if (cpu == RING_BUFFER_ALL_CPUS) {
> + if (cpumask_empty(tr->pipe_cpumask)) {
> + cpumask_setall(tr->pipe_cpumask);
> + return 0;
> + }
> + } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
> + cpumask_set_cpu(cpu, tr->pipe_cpumask);
> + return 0;
> + }
> + return -EBUSY;
> +}
> +
> +static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
> +{
> + if (cpu == RING_BUFFER_ALL_CPUS) {
> + WARN_ON(!cpumask_full(tr->pipe_cpumask));
> + cpumask_clear(tr->pipe_cpumask);
> + } else {
> + WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
> + cpumask_clear_cpu(cpu, tr->pipe_cpumask);
> + }
> +}
> +
> static int tracing_open_pipe(struct inode *inode, struct file *filp)
> {
> struct trace_array *tr = inode->i_private;
> struct trace_iterator *iter;
> + int cpu;
> int ret;
>
> ret = tracing_check_open_get_tr(tr);
> @@ -6716,13 +6742,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
> return ret;
>
> mutex_lock(&trace_types_lock);
> + cpu = tracing_get_cpu(inode);
> + ret = open_pipe_on_cpu(tr, cpu);
> + if (ret)
> + goto fail_pipe_on_cpu;
>
> /* create a buffer to store the information to pass to userspace */
> iter = kzalloc(sizeof(*iter), GFP_KERNEL);
> if (!iter) {
> ret = -ENOMEM;
> - __trace_array_put(tr);
> - goto out;
> + goto fail_alloc_iter;
> }
>
> trace_seq_init(&iter->seq);
> @@ -6745,7 +6774,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
>
> iter->tr = tr;
> iter->array_buffer = &tr->array_buffer;
> - iter->cpu_file = tracing_get_cpu(inode);
> + iter->cpu_file = cpu;
> mutex_init(&iter->mutex);
> filp->private_data = iter;
>
> @@ -6755,12 +6784,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
> nonseekable_open(inode, filp);
>
> tr->trace_ref++;
> -out:
> +
> mutex_unlock(&trace_types_lock);
> return ret;
>
> fail:
> kfree(iter);
> +fail_alloc_iter:
> + close_pipe_on_cpu(tr, cpu);
> +fail_pipe_on_cpu:
> __trace_array_put(tr);
> mutex_unlock(&trace_types_lock);
> return ret;
> @@ -6777,7 +6809,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
>
> if (iter->trace->pipe_close)
> iter->trace->pipe_close(iter);
> -
> + close_pipe_on_cpu(tr, iter->cpu_file);
> mutex_unlock(&trace_types_lock);
>
> free_cpumask_var(iter->started);
> @@ -9441,6 +9473,9 @@ static struct trace_array *trace_array_create(const char *name)
> if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
> goto out_free_tr;
>
> + if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
> + goto out_free_tr;
> +
> tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
>
> cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
> @@ -9482,6 +9517,7 @@ static struct trace_array *trace_array_create(const char *name)
> out_free_tr:
> ftrace_free_ftrace_ops(tr);
> free_trace_buffers(tr);
> + free_cpumask_var(tr->pipe_cpumask);
> free_cpumask_var(tr->tracing_cpumask);
> kfree(tr->name);
> kfree(tr);
> @@ -9584,6 +9620,7 @@ static int __remove_instance(struct trace_array *tr)
> }
> kfree(tr->topts);
>
> + free_cpumask_var(tr->pipe_cpumask);
> free_cpumask_var(tr->tracing_cpumask);
> kfree(tr->name);
> kfree(tr);
> @@ -10381,12 +10418,14 @@ __init static int tracer_alloc_buffers(void)
> if (trace_create_savedcmd() < 0)
> goto out_free_temp_buffer;
>
> + if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
> + goto out_free_savedcmd;
> +
> /* TODO: make the number of buffers hot pluggable with CPUS */
> if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
> MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
> - goto out_free_savedcmd;
> + goto out_free_pipe_cpumask;
> }
> -
> if (global_trace.buffer_disabled)
> tracing_off();
>
> @@ -10439,6 +10478,8 @@ __init static int tracer_alloc_buffers(void)
>
> return 0;
>
> +out_free_pipe_cpumask:
> + free_cpumask_var(global_trace.pipe_cpumask);
> out_free_savedcmd:
> free_saved_cmdlines_buffer(savedcmd);
> out_free_temp_buffer:
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index e1edc2197fc8..53ac0f7780c2 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -377,6 +377,8 @@ struct trace_array {
> struct list_head events;
> struct trace_event_file *trace_marker_file;
> cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
> + /* one per_cpu trace_pipe can be opened by only one user */
> + cpumask_var_t pipe_cpumask;
> int ref;
> int trace_ref;
> #ifdef CONFIG_FUNCTION_TRACER
> --
> 2.25.1
>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Fri, 18 Aug 2023 14:03:09 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> Looks good to me.
>
> Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
>
> BTW, this reminds me that splice read support had been introduced for
> virtio-trace support. I've tested it with/without this patch, but I hit
> a different issue, which seems like a virtio-serial issue (per-cpu buffers
> on the guest are spliced correctly, but the data can not be read from host
> side). Let me investigate it.

Is virtio-trace using trace_pipe or trace_pipe_raw? Those two are handled
differently.

-- Steve
On Fri, 18 Aug 2023 09:41:28 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Fri, 18 Aug 2023 14:03:09 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
>
> > Looks good to me.
> >
> > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> >
> > BTW, this reminds me that splice read support had been introduced for
> > virtio-trace support. I've tested it with/without this patch, but I hit
> > a different issue, which seems like a virtio-serial issue (per-cpu buffers
> > on the guest are spliced correctly, but the data can not be read from host
> > side). Let me investigate it.
>
> Is virtio-trace using trace_pipe or trace_pipe_raw? Those two are handled
> differently.

It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
returns -1 and errno == EAGAIN, the trace data will be lost?

Thank you,

>
> -- Steve

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Fri, 18 Aug 2023 23:23:01 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
> returns -1 and errno == EAGAIN, the trace data will be lost?

It shouldn't. If it does, then there's likely a bug. The code will block
and if an interrupt comes in it will return immediately without reading
from the buffer.

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262

I don't see where it would return -EINTR and consume data, but I may be
missing something.

-- Steve
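For illustration, a minimal userspace sketch of the retry behavior under
discussion is below: splice() returning -1 with errno set to EAGAIN or EINTR
is treated as "nothing was consumed yet, try again" rather than as an error.
The tracefs path, the non-blocking open, the chunk size and the sleep interval
are assumptions made only for this example.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	int raw_fd;

	raw_fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw",
		      O_RDONLY | O_NONBLOCK);
	if (raw_fd < 0 || pipe(pipefd) < 0) {
		perror("setup");
		return 1;
	}

	for (;;) {
		ssize_t rlen = splice(raw_fd, NULL, pipefd[1], NULL, 4096,
				      SPLICE_F_MOVE | SPLICE_F_MORE);

		if (rlen < 0 && (errno == EAGAIN || errno == EINTR)) {
			usleep(10000);	/* nothing consumed; retry later */
			continue;
		}
		if (rlen < 0) {
			perror("splice");
			return 1;
		}
		if (rlen == 0) {
			usleep(10000);	/* buffer currently empty */
			continue;
		}
		/* rlen bytes now sit in pipefd[0]; drain them here. */
	}
}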
On Fri, 18 Aug 2023 11:53:22 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> On Fri, 18 Aug 2023 23:23:01 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
>
> > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
> > returns -1 and errno == EAGAIN, the trace data will be lost?
>
> It shouldn't. If it does, then there's likely a bug. The code will block
> and if an interrupt comes in it will return immediately without reading
> from the buffer.
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262
>
> I don't see where it would return -EINTR and consume data, but I may be
> missing something.
Hmm, I suspect the case where splice_to_pipe() returns -EAGAIN.

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8491

It does not seem to handle that case.

Anyway, I also think something is wrong in virtio-serial (or I am misusing it?),
since it sometimes cannot read anything from the host. I just set up virtio-trace
with the below patch (ignoring EAGAIN).
From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Thu, 17 Aug 2023 14:08:40 +0900
Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
splice() can return EAGAIN error instead of returning 0 size read.
In that case, wait a while and try to call splice() again.
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c
index ddfe7875eb16..e8a4c4f0c499 100644
--- a/tools/virtio/virtio-trace/trace-agent-rw.c
+++ b/tools/virtio/virtio-trace/trace-agent-rw.c
@@ -8,6 +8,7 @@
*/
#define _GNU_SOURCE
+#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
@@ -127,10 +128,10 @@ static void *rw_thread_main(void *thread_info)
rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL,
ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE);
- if (rlen < 0) {
- pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num);
+ if (rlen < 0 && errno != EAGAIN) {
+ pr_err("Splice_read error (%d) in rw-thread(%d)\n", errno, ts->cpu_num);
goto error;
- } else if (rlen == 0) {
+ } else if (rlen == 0 || errno == EAGAIN) {
/*
* If trace data do not exist or are unreadable not
* for exceeding the page size, splice_read returns
--
2.34.1
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Sat, 19 Aug 2023 10:42:57 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> >From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001
> From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> Date: Thu, 17 Aug 2023 14:08:40 +0900
> Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
>
> splice() can return EAGAIN error instead of returning 0 size read.
> In that case, wait a while and try to call splice() again.
>
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Bah,

When I pulled in Zheng's patch from patchwork, it included the above
signed-off-by. I was confused to why it added your SoB.

Probably best if we had started this conversation on another thread and
not hijack the discussion about this patch.

-- Steve

> ---
> tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
On Mon, 21 Aug 2023 11:17:25 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Sat, 19 Aug 2023 10:42:57 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
>
> > >From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001
> > From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> > Date: Thu, 17 Aug 2023 14:08:40 +0900
> > Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
> >
> > splice() can return EAGAIN error instead of returning 0 size read.
> > In that case, wait a while and try to call splice() again.
> >
> > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
>
> Bah,
>
> When I pulled in Zheng's patch from patchwork, it included the above
> signed-off-by. I was confused to why it added your SoB.

Oops, I didn't notice such patchwork behavior.

> Probably best if we had started this conversation on another thread and
> not hijack the discussion about this patch.

Agreed. Sorry for the confusion.

> -- Steve
>
> > ---
> > tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
> > 1 file changed, 4 insertions(+), 3 deletions(-)

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
On Sat, 19 Aug 2023 10:42:57 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> On Fri, 18 Aug 2023 11:53:22 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > On Fri, 18 Aug 2023 23:23:01 +0900
> > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> >
> > > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
> > > returns -1 and errno == EAGAIN, the trace data will be lost?
> >
> > It shouldn't. If it does, then there's likely a bug. The code will block
> > and if an interrupt comes in it will return immediately without reading
> > from the buffer.
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262
> >
> > I don't see where it would return -EINTR and consume data, but I may be
> > missing something.
>
> Hmm, I suspect the case where splice_to_pipe() returns -EAGAIN.
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8491
>
> It does not seem to handle that case.
>
> Anyway, I also think something is wrong in virtio-serial (or I am misusing it?),
> since it sometimes cannot read anything from the host. I just set up virtio-trace
> with the below patch (ignoring EAGAIN).
Hmm, I couldn't reproduce it. (Maybe a host security update changed something?)
Anyway, I confirmed that the ring buffer pages will not be consumed unless
splice_to_pipe() succeeded.
Thank you,
>
>
> From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001
> From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> Date: Thu, 17 Aug 2023 14:08:40 +0900
> Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
>
> splice() can return EAGAIN error instead of returning 0 size read.
> In that case, wait a while and try to call splice() again.
>
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
> tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c
> index ddfe7875eb16..e8a4c4f0c499 100644
> --- a/tools/virtio/virtio-trace/trace-agent-rw.c
> +++ b/tools/virtio/virtio-trace/trace-agent-rw.c
> @@ -8,6 +8,7 @@
> */
>
> #define _GNU_SOURCE
> +#include <errno.h>
> #include <fcntl.h>
> #include <stdio.h>
> #include <stdlib.h>
> @@ -127,10 +128,10 @@ static void *rw_thread_main(void *thread_info)
> rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL,
> ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE);
>
> - if (rlen < 0) {
> - pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num);
> + if (rlen < 0 && errno != EAGAIN) {
> + pr_err("Splice_read error (%d) in rw-thread(%d)\n", errno, ts->cpu_num);
> goto error;
> - } else if (rlen == 0) {
> + } else if (rlen == 0 || errno == EAGAIN) {
> /*
> * If trace data do not exist or are unreadable not
> * for exceeding the page size, splice_read returns
> --
> 2.34.1
>
>
> --
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>