From nobody Fri Dec 19 11:04:11 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BB752208BC for ; Tue, 19 Dec 2023 14:32:00 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 24EF2C433C7; Tue, 19 Dec 2023 14:32:00 +0000 (UTC) Received: from rostedt by gandalf with local (Exim 4.97) (envelope-from ) id 1rFb9X-00000003MDx-130V; Tue, 19 Dec 2023 09:32:59 -0500 Message-ID: <20231219143259.032310636@goodmis.org> User-Agent: quilt/0.67 Date: Tue, 19 Dec 2023 09:32:34 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton , Sean Paul , Arun Easi , Daniel Wagner , Dmytro Maluka Subject: [for-next][PATCH 01/10] tracing: Allow creating instances with specified system events References: <20231219143233.204534014@goodmis.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Steven Rostedt (Google)" A trace instance may only need to enable specific events. As the eventfs directory of an instance currently creates all events which adds overhead, allow internal instances to be created with just the events in systems that they care about. This currently only deals with systems and not individual events, but this should bring down the overhead of creating instances for specific use cases quite bit. The trace_array_get_by_name() now has another parameter "systems". This parameter is a const string pointer of a comma/space separated list of event systems that should be created by the trace_array. (Note if the trace_array already exists, this parameter is ignored). The list of systems is saved and if a module is loaded, its events will not be added unless the system for those events also match the systems string. Link: https://lore.kernel.org/linux-trace-kernel/20231213093701.03fddec0@ga= ndalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Sean Paul Cc: Arun Easi Cc: Daniel Wagner Tested-by: Dmytro Maluka Signed-off-by: Steven Rostedt (Google) --- drivers/scsi/qla2xxx/qla_os.c | 2 +- include/linux/trace.h | 4 +-- kernel/trace/trace.c | 23 +++++++++++--- kernel/trace/trace.h | 1 + kernel/trace/trace_boot.c | 2 +- kernel/trace/trace_events.c | 48 +++++++++++++++++++++++++++-- samples/ftrace/sample-trace-array.c | 2 +- 7 files changed, 70 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 03348f605c2e..dd674378f2f3 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -2889,7 +2889,7 @@ static void qla2x00_iocb_work_fn(struct work_struct *= work) static void qla_trace_init(void) { - qla_trc_array =3D trace_array_get_by_name("qla2xxx"); + qla_trc_array =3D trace_array_get_by_name("qla2xxx", NULL); if (!qla_trc_array) { ql_log(ql_log_fatal, NULL, 0x0001, "Unable to create qla2xxx trace instance, instance logging will b= e disabled.\n"); diff --git a/include/linux/trace.h b/include/linux/trace.h index 2a70a447184c..fdcd76b7be83 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -51,7 +51,7 @@ int trace_array_printk(struct trace_array *tr, unsigned l= ong ip, const char *fmt, ...); int trace_array_init_printk(struct trace_array *tr); void trace_array_put(struct trace_array *tr); -struct trace_array *trace_array_get_by_name(const char *name); +struct trace_array *trace_array_get_by_name(const char *name, const char *= systems); int trace_array_destroy(struct trace_array *tr); =20 /* For osnoise tracer */ @@ -84,7 +84,7 @@ static inline int trace_array_init_printk(struct trace_ar= ray *tr) static inline void trace_array_put(struct trace_array *tr) { } -static inline struct trace_array *trace_array_get_by_name(const char *name) +static inline struct trace_array *trace_array_get_by_name(const char *name= , const char *systems) { return NULL; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 199df497db07..59e39b652afb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9490,7 +9490,8 @@ static int trace_array_create_dir(struct trace_array = *tr) return ret; } =20 -static struct trace_array *trace_array_create(const char *name) +static struct trace_array * +trace_array_create_systems(const char *name, const char *systems) { struct trace_array *tr; int ret; @@ -9510,6 +9511,12 @@ static struct trace_array *trace_array_create(const = char *name) if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) goto out_free_tr; =20 + if (systems) { + tr->system_names =3D kstrdup_const(systems, GFP_KERNEL); + if (!tr->system_names) + goto out_free_tr; + } + tr->trace_flags =3D global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; =20 cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9556,12 +9563,18 @@ static struct trace_array *trace_array_create(const= char *name) free_trace_buffers(tr); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); =20 return ERR_PTR(ret); } =20 +static struct trace_array *trace_array_create(const char *name) +{ + return trace_array_create_systems(name, NULL); +} + static int instance_mkdir(const char *name) { struct trace_array *tr; @@ -9587,6 +9600,7 @@ static int instance_mkdir(const char *name) /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. * @name: The name of the trace array to be looked up/created. + * @systems: A list of systems to create event directories for (NULL for a= ll) * * Returns pointer to trace array with given name. * NULL, if it cannot be created. @@ -9600,7 +9614,7 @@ static int instance_mkdir(const char *name) * trace_array_put() is called, user space can not delete it. * */ -struct trace_array *trace_array_get_by_name(const char *name) +struct trace_array *trace_array_get_by_name(const char *name, const char *= systems) { struct trace_array *tr; =20 @@ -9612,7 +9626,7 @@ struct trace_array *trace_array_get_by_name(const cha= r *name) goto out_unlock; } =20 - tr =3D trace_array_create(name); + tr =3D trace_array_create_systems(name, systems); =20 if (IS_ERR(tr)) tr =3D NULL; @@ -9659,6 +9673,7 @@ static int __remove_instance(struct trace_array *tr) =20 free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); =20 @@ -10377,7 +10392,7 @@ __init static void enable_instances(void) if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) do_allocate_snapshot(tok); =20 - tr =3D trace_array_get_by_name(tok); + tr =3D trace_array_get_by_name(tok, NULL); if (!tr) { pr_warn("Failed to create instance buffer %s\n", curr_str); continue; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0489e72c8169..79180aed13ee 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -377,6 +377,7 @@ struct trace_array { unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; + const char *system_names; struct list_head err_log; struct dentry *dir; struct dentry *options; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7ccc7a8e155b..dbe29b4c6a7a 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node) if (!p || *p =3D=3D '\0') continue; =20 - tr =3D trace_array_get_by_name(p); + tr =3D trace_array_get_by_name(p, NULL); if (!tr) { pr_err("Failed to get trace instance %s\n", p); continue; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f29e815ca5b2..b70d03818038 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2896,6 +2896,27 @@ void trace_event_eval_update(struct trace_eval_map *= *map, int len) up_write(&trace_event_sem); } =20 +static bool event_in_systems(struct trace_event_call *call, + const char *systems) +{ + const char *system; + const char *p; + + if (!systems) + return true; + + system =3D call->class->system; + p =3D strstr(systems, system); + if (!p) + return false; + + if (p !=3D systems && !isspace(*(p - 1)) && *(p - 1) !=3D ',') + return false; + + p +=3D strlen(system); + return !*p || isspace(*p) || *p =3D=3D ','; +} + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -2905,9 +2926,12 @@ trace_create_new_event(struct trace_event_call *call, struct trace_event_file *file; unsigned int first; =20 + if (!event_in_systems(call, tr->system_names)) + return NULL; + file =3D kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) - return NULL; + return ERR_PTR(-ENOMEM); =20 pid_list =3D rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); @@ -2972,8 +2996,17 @@ __trace_add_new_event(struct trace_event_call *call,= struct trace_array *tr) struct trace_event_file *file; =20 file =3D trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * allocation, or NULL if the event is not part of the tr->system_names. + * When the event is not part of the tr->system_names, return zero, not + * an error. + */ if (!file) - return -ENOMEM; + return 0; + + if (IS_ERR(file)) + return PTR_ERR(file); =20 if (eventdir_initialized) return event_create_dir(tr->event_dir, file); @@ -3012,8 +3045,17 @@ __trace_early_add_new_event(struct trace_event_call = *call, int ret; =20 file =3D trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * allocation, or NULL if the event is not part of the tr->system_names. + * When the event is not part of the tr->system_names, return zero, not + * an error. + */ if (!file) - return -ENOMEM; + return 0; + + if (IS_ERR(file)) + return PTR_ERR(file); =20 ret =3D event_define_fields(call); if (ret) diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-tr= ace-array.c index 6aba02a31c96..d0ee9001c7b3 100644 --- a/samples/ftrace/sample-trace-array.c +++ b/samples/ftrace/sample-trace-array.c @@ -105,7 +105,7 @@ static int __init sample_trace_array_init(void) * NOTE: This function increments the reference counter * associated with the trace array - "tr". */ - tr =3D trace_array_get_by_name("sample-instance"); + tr =3D trace_array_get_by_name("sample-instance", "sched,timer,kprobes"); =20 if (!tr) return -1; --=20 2.42.0 From nobody Fri Dec 19 11:04:11 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id DE627208D0 for ; Tue, 19 Dec 2023 14:32:00 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 7B376C433CA; Tue, 19 Dec 2023 14:32:00 +0000 (UTC) Received: from rostedt by gandalf with local (Exim 4.97) (envelope-from ) id 1rFb9X-00000003MER-2HJA; Tue, 19 Dec 2023 09:32:59 -0500 Message-ID: <20231219143259.319621961@goodmis.org> User-Agent: quilt/0.67 Date: Tue, 19 Dec 2023 09:32:35 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton Subject: [for-next][PATCH 02/10] ring-buffer: Add offset of events in dump on mismatch References: <20231219143233.204534014@goodmis.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Steven Rostedt (Google)" On bugs that have the ring buffer timestamp get out of sync, the config CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS, that checks for it and if it is detected it causes a dump of the bad sub buffer. It shows each event and their timestamp as well as the delta in the event. But it's also good to see the offset into the subbuffer for that event to know if how close to the end it is. Also print where the last event actually ended compared to where it was expected to end. Link: https://lore.kernel.org/linux-trace-kernel/20231211131623.59eaebd2@ga= ndalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 83eab547f1d1..bfe2697a92ee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3358,29 +3358,34 @@ static void dump_buffer_page(struct buffer_data_pag= e *bpage, case RINGBUF_TYPE_TIME_EXTEND: delta =3D rb_event_time_stamp(event); ts +=3D delta; - pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); + pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n", + e, ts, delta); break; =20 case RINGBUF_TYPE_TIME_STAMP: delta =3D rb_event_time_stamp(event); ts =3D rb_fix_abs_ts(delta, ts); - pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); + pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n", + e, ts, delta); break; =20 case RINGBUF_TYPE_PADDING: ts +=3D event->time_delta; - pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d PADDING\n", + e, ts, event->time_delta); break; =20 case RINGBUF_TYPE_DATA: ts +=3D event->time_delta; - pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d\n", + e, ts, event->time_delta); break; =20 default: break; } } + pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e= ); } =20 static DEFINE_PER_CPU(atomic_t, checking); --=20 2.42.0 From nobody Fri Dec 19 11:04:11 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4A45A20DC2 for ; Tue, 19 Dec 2023 14:32:00 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id C5BE1C433BA; Tue, 19 Dec 2023 14:32:00 +0000 (UTC) Received: from rostedt by gandalf with local (Exim 4.97) (envelope-from ) id 1rFb9X-00000003MEv-3Tx5; Tue, 19 Dec 2023 09:32:59 -0500 Message-ID: <20231219143259.611969166@goodmis.org> User-Agent: quilt/0.67 Date: Tue, 19 Dec 2023 09:32:36 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton Subject: [for-next][PATCH 03/10] tracing: Allow for max buffer data size trace_marker writes References: <20231219143233.204534014@goodmis.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Steven Rostedt (Google)" Allow a trace write to be as big as the ring buffer tracing data will allow. Currently, it only allows writes of 1KB in size, but there's no reason that it cannot allow what the ring buffer can hold. Link: https://lore.kernel.org/linux-trace-kernel/20231212131901.5f501e72@ga= ndalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 15 +++++++++++++++ kernel/trace/trace.c | 31 ++++++++++++++++++++++++------- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 782e14f62201..b1b03b2c0f08 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -141,6 +141,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *ite= r); bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter); =20 unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu); +unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer); =20 void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bfe2697a92ee..16b640d824f9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5190,6 +5190,21 @@ unsigned long ring_buffer_size(struct trace_buffer *= buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_size); =20 +/** + * ring_buffer_max_event_size - return the max data size of an event + * @buffer: The ring buffer. + * + * Returns the maximum size an event can be. + */ +unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer) +{ + /* If abs timestamp is requested, events have a timestamp too */ + if (ring_buffer_time_stamp_abs(buffer)) + return BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND; + return BUF_MAX_DATA_SIZE; +} +EXPORT_SYMBOL_GPL(ring_buffer_max_event_size); + static void rb_clear_buffer_page(struct buffer_page *page) { local_set(&page->write, 0); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 59e39b652afb..dba1328e454b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7278,8 +7278,9 @@ tracing_mark_write(struct file *filp, const char __us= er *ubuf, enum event_trigger_type tt =3D ETT_NONE; struct trace_buffer *buffer; struct print_entry *entry; + int meta_size; ssize_t written; - int size; + size_t size; int len; =20 /* Used in tracing_mark_raw_write() as well */ @@ -7292,12 +7293,12 @@ tracing_mark_write(struct file *filp, const char __= user *ubuf, if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; =20 - if (cnt > TRACE_BUF_SIZE) - cnt =3D TRACE_BUF_SIZE; - - BUILD_BUG_ON(TRACE_BUF_SIZE >=3D PAGE_SIZE); + if ((ssize_t)cnt < 0) + return -EINVAL; =20 - size =3D sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ + meta_size =3D sizeof(*entry) + 2; /* add '\0' and possible '\n' */ + again: + size =3D cnt + meta_size; =20 /* If less than "", then make sure we can still add that */ if (cnt < FAULTED_SIZE) @@ -7306,9 +7307,25 @@ tracing_mark_write(struct file *filp, const char __u= ser *ubuf, buffer =3D tr->array_buffer.buffer; event =3D __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); - if (unlikely(!event)) + if (unlikely(!event)) { + /* + * If the size was greater than what was allowed, then + * make it smaller and try again. + */ + if (size > ring_buffer_max_event_size(buffer)) { + /* cnt < FAULTED size should never be bigger than max */ + if (WARN_ON_ONCE(cnt < FAULTED_SIZE)) + return -EBADF; + cnt =3D ring_buffer_max_event_size(buffer) - meta_size; + /* The above should only happen once */ + if (WARN_ON_ONCE(cnt + meta_size =3D=3D size)) + return -EBADF; + goto again; + } + /* Ring buffer disabled, return as if not open for write */ return -EBADF; + } =20 entry =3D ring_buffer_event_data(event); entry->ip =3D _THIS_IP_; --=20 2.42.0 From nobody Fri Dec 19 11:04:11 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9FBE420DDC for ; Tue, 19 Dec 2023 14:32:01 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 0FC71C433CC; Tue, 19 Dec 2023 14:32:01 +0000 (UTC) Received: from rostedt by gandalf with local (Exim 4.97) (envelope-from ) id 1rFb9Y-00000003MFP-0VSl; Tue, 19 Dec 2023 09:33:00 -0500 Message-ID: <20231219143259.902317852@goodmis.org> User-Agent: quilt/0.67 Date: Tue, 19 Dec 2023 09:32:37 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton Subject: [for-next][PATCH 04/10] trace_seq: Increase the buffer size to almost two pages References: <20231219143233.204534014@goodmis.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Steven Rostedt (Google)" Now that trace_marker can hold more than 1KB string, and can write as much as the ring buffer can hold, the trace_seq is not big enough to hold writes: ~# a=3D"1234567890" ~# cnt=3D4080 ~# s=3D"" ~# while [ $cnt -gt 10 ]; do ~# s=3D"${s}${a}" ~# cnt=3D$((cnt-10)) ~# done ~# echo $s > trace_marker ~# cat trace # tracer: nop # # entries-in-buffer/entries-written: 2/2 #P:8 # # _-----=3D> irqs-off/BH-disabled # / _----=3D> need-resched # | / _---=3D> hardirq/softirq # || / _--=3D> preempt-depth # ||| / _-=3D> migrate-disable # |||| / delay # TASK-PID CPU# ||||| TIMESTAMP FUNCTION # | | | ||||| | | <...>-860 [002] ..... 105.543465: tracing_mark_write[LINE= TOO BIG] <...>-860 [002] ..... 105.543496: tracing_mark_write: 789= 012345678901234567890 By increasing the trace_seq buffer to almost two pages, it can now print out the first line. This also subtracts the rest of the trace_seq fields from the buffer, so that the entire trace_seq is now PAGE_SIZE aligned. Link: https://lore.kernel.org/linux-trace-kernel/20231209175220.19867af4@ga= ndalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_seq.h | 9 ++++++--- kernel/trace/trace.c | 6 +++--- kernel/trace/trace_seq.c | 3 --- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 3691e0e76a1a..9ec229dfddaa 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -8,11 +8,14 @@ =20 /* * Trace sequences are used to allow a function to call several other func= tions - * to create a string of data to use (up to a max of PAGE_SIZE). + * to create a string of data to use. */ =20 +#define TRACE_SEQ_BUFFER_SIZE (PAGE_SIZE * 2 - \ + (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int))) + struct trace_seq { - char buffer[PAGE_SIZE]; + char buffer[TRACE_SEQ_BUFFER_SIZE]; struct seq_buf seq; size_t readpos; int full; @@ -21,7 +24,7 @@ struct trace_seq { static inline void trace_seq_init(struct trace_seq *s) { - seq_buf_init(&s->seq, s->buffer, PAGE_SIZE); + seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE); s->full =3D 0; s->readpos =3D 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dba1328e454b..0be30cccabb4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3753,7 +3753,7 @@ static bool trace_safe_str(struct trace_iterator *ite= r, const char *str, =20 /* OK if part of the temp seq buffer */ if ((addr >=3D (unsigned long)iter->tmp_seq.buffer) && - (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) + (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE)) return true; =20 /* Core rodata can not be freed */ @@ -6932,8 +6932,8 @@ tracing_read_pipe(struct file *filp, char __user *ubu= f, goto out; } =20 - if (cnt >=3D PAGE_SIZE) - cnt =3D PAGE_SIZE - 1; + if (cnt >=3D TRACE_SEQ_BUFFER_SIZE) + cnt =3D TRACE_SEQ_BUFFER_SIZE - 1; =20 /* reset all but tr, trace, and overruns */ trace_iterator_reset(iter); diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 7be97229ddf8..c158d65a8a88 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -13,9 +13,6 @@ * trace_seq_init() more than once to reset the trace_seq to start * from scratch. *=20 - * The buffer size is currently PAGE_SIZE, although it may become dynamic - * in the future. - * * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into * the buffer but it wont update the pointers). This allows users to --=20 2.42.0 From nobody Fri Dec 19 11:04:11 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7284E20DCF for ; Tue, 19 Dec 2023 14:32:01 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 5BB32C433CB; Tue, 19 Dec 2023 14:32:01 +0000 (UTC) Received: from rostedt by gandalf with local (Exim 4.97) (envelope-from ) id 1rFb9Y-00000003MFu-1jlQ; Tue, 19 Dec 2023 09:33:00 -0500 Message-ID: <20231219143300.196456990@goodmis.org> User-Agent: quilt/0.67 Date: Tue, 19 Dec 2023 09:32:38 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton Subject: [for-next][PATCH 05/10] tracing: Have trace_marker break up by lines by size of trace_seq References: <20231219143233.204534014@goodmis.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Steven Rostedt (Google)" If a trace_marker write is bigger than what trace_seq can hold, then it will print "LINE TOO BIG" message and not what was written. Instead, check if the write is bigger than the trace_seq and break it up by that size. Ideally, we could make the trace_seq dynamic that could hold this. But that's for another time. Link: https://lore.kernel.org/linux-trace-kernel/20231212190422.1eaf224f@ga= ndalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0be30cccabb4..9cf58383d2fb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7304,6 +7304,11 @@ tracing_mark_write(struct file *filp, const char __u= ser *ubuf, if (cnt < FAULTED_SIZE) size +=3D FAULTED_SIZE - cnt; =20 + if (size > TRACE_SEQ_BUFFER_SIZE) { + cnt -=3D size - TRACE_SEQ_BUFFER_SIZE; + goto again; + } + buffer =3D tr->array_buffer.buffer; event =3D __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); --=20 2.42.0 From nobody Fri Dec 19 11:04:11 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id AD2D220DE3 for ; Tue, 19 Dec 2023 14:32:01 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 9052BC433CA; Tue, 19 Dec 2023 14:32:01 +0000 (UTC) Received: from rostedt by gandalf with local (Exim 4.97) (envelope-from ) id 1rFb9Y-00000003MGP-2xw1; Tue, 19 Dec 2023 09:33:00 -0500 Message-ID: <20231219143300.487404467@goodmis.org> User-Agent: quilt/0.67 Date: Tue, 19 Dec 2023 09:32:39 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton Subject: [for-next][PATCH 06/10] tracing: Increase size of trace_marker_raw to max ring buffer entry References: <20231219143233.204534014@goodmis.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Steven Rostedt (Google)" There's no reason to give an arbitrary limit to the size of a raw trace marker. Just let it be as big as the size that is allowed by the ring buffer itself. And there's also no reason to artificially break up the write to TRACE_BUF_SIZE, as that's not even used. Link: https://lore.kernel.org/linux-trace-kernel/20231213104218.2efc70c1@ga= ndalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9cf58383d2fb..55dabee4c78b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7365,9 +7365,6 @@ tracing_mark_write(struct file *filp, const char __us= er *ubuf, return written; } =20 -/* Limit it for now to 3K (including tag) */ -#define RAW_DATA_MAX_SIZE (1024*3) - static ssize_t tracing_mark_raw_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -7389,19 +7386,18 @@ tracing_mark_raw_write(struct file *filp, const cha= r __user *ubuf, return -EINVAL; =20 /* The marker must at least have a tag id */ - if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) + if (cnt < sizeof(unsigned int)) return -EINVAL; =20 - if (cnt > TRACE_BUF_SIZE) - cnt =3D TRACE_BUF_SIZE; - - BUILD_BUG_ON(TRACE_BUF_SIZE >=3D PAGE_SIZE); - size =3D sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size +=3D FAULT_SIZE_ID - cnt; =20 buffer =3D tr->array_buffer.buffer; + + if (size > ring_buffer_max_event_size(buffer)) + return -EINVAL; + event =3D __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, tracing_gen_ctx()); if (!event) --=20 2.42.0 From nobody Fri Dec 19 11:04:11 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 710D6225D2 for ; Tue, 19 Dec 2023 14:32:02 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id E14B1C433CC; Tue, 19 Dec 2023 14:32:01 +0000 (UTC) Received: from rostedt by gandalf with local (Exim 4.97) (envelope-from ) id 1rFb9Y-00000003MGu-4C2R; Tue, 19 Dec 2023 09:33:01 -0500 Message-ID: <20231219143300.777936212@goodmis.org> User-Agent: quilt/0.67 Date: Tue, 19 Dec 2023 09:32:40 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton , Shuah Khan Subject: [for-next][PATCH 07/10] tracing/selftests: Add test to test the trace_marker References: <20231219143233.204534014@goodmis.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Steven Rostedt (Google)" Add a test that writes longs strings, some over the size of the sub buffer and make sure that the entire content is there. Link: https://lore.kernel.org/linux-trace-kernel/20231213111527.6a4c9b58@ga= ndalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Acked-by: Shuah Khan Signed-off-by: Steven Rostedt (Google) --- .../ftrace/test.d/00basic/trace_marker.tc | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100755 tools/testing/selftests/ftrace/test.d/00basic/trace_mar= ker.tc diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc = b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc new file mode 100755 index 000000000000..9aa0db2b84fc --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc @@ -0,0 +1,82 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Basic tests on writing to trace_marker +# requires: trace_marker +# flags: instance + +get_buffer_data_size() { + sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page +} + +get_buffer_data_offset() { + sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page +} + +get_event_header_size() { + type_len=3D`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' event= s/header_event` + time_len=3D`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' eve= nts/header_event` + array_len=3D`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/= header_event` + total_bits=3D$((type_len+time_len+array_len)) + total_bits=3D$((total_bits+7)) + echo $((total_bits/8)) +} + +get_print_event_buf_offset() { + sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/for= mat +} + +event_header_size=3D`get_event_header_size` +print_header_size=3D`get_print_event_buf_offset` + +data_offset=3D`get_buffer_data_offset` + +marker_meta=3D$((event_header_size+print_header_size)) + +make_str() { + cnt=3D$1 + # subtract two for \n\0 as marker adds these + cnt=3D$((cnt-2)) + printf -- 'X%.0s' $(seq $cnt) +} + +write_buffer() { + size=3D$1 + + str=3D`make_str $size` + + # clear the buffer + echo > trace + + # write the string into the marker + echo -n $str > trace_marker + + echo $str +} + +test_buffer() { + + size=3D`get_buffer_data_size` + oneline_size=3D$((size-marker_meta)) + echo size =3D $size + echo meta size =3D $marker_meta + + # Now add a little more the meta data overhead will overflow + + str=3D`write_buffer $size` + + # Make sure the line was broken + new_str=3D`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,""= );printf "%s", $0; exit}' trace` + + if [ "$new_str" =3D "$str" ]; then + exit fail; + fi + + # Make sure the entire line can be found + new_str=3D`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,""= );printf "%s", $0; }' trace` + + if [ "$new_str" !=3D "$str" ]; then + exit fail; + fi +} + +test_buffer --=20 2.42.0 From nobody Fri Dec 19 11:04:11 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 162342D7A6 for ; Tue, 19 Dec 2023 14:32:02 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 37B11C433C7; Tue, 19 Dec 2023 14:32:02 +0000 (UTC) Received: from rostedt by gandalf with local (Exim 4.97) (envelope-from ) id 1rFb9Z-00000003MHS-1GQ9; Tue, 19 Dec 2023 09:33:01 -0500 Message-ID: <20231219143301.069856775@goodmis.org> User-Agent: quilt/0.67 Date: Tue, 19 Dec 2023 09:32:41 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton , Linus Torvalds Subject: [for-next][PATCH 08/10] ring-buffer: Remove 32bit timestamp logic References: <20231219143233.204534014@goodmis.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Steven Rostedt (Google)" Each event has a 27 bit timestamp delta that is used to hold the delta from the last event. If the time between events is greater than 2^27, then a timestamp is added that holds a 59 bit absolute timestamp. Until a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp"), if an interrupt interrupted an event in progress, all the events delta would be zero to not deal with the races that need to be handled. The commit a389d86f7fd09 changed that to handle the races giving all events, even those that preempt other events, still have an accurate timestamp. To handle those races requires performing 64-bit cmpxchg on the timestamps. But doing 64-bit cmpxchg on 32-bit architectures is considered very slow. To try to deal with this the timestamp logic was broken into two and then three 32-bit cmpxchgs, with the thought that two (or three) 32-bit cmpxchgs are still faster than a single 64-bit cmpxchg on 32-bit architectures. Part of the problem with this is that I didn't have any 32-bit architectures to test on. After hitting several subtle bugs in this code, an effort was made to try and see if three 32-bit cmpxchgs are indeed faster than a single 64-bit. After a few people brushed off the dust of their old 32-bit machines, tests were done, and even though 32-bit cmpxchg was faster than a single 64-bit, it was in the order of 50% at best, not 300%. After some more refactoring of the code, all 4 64-bit cmpxchg were removed: https://lore.kernel.org/linux-trace-kernel/20231211114420.36dde01b@gandalf= .local.home https://lore.kernel.org/linux-trace-kernel/20231214222921.193037a7@gandalf= .local.home https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorscha= ch.local.home https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf= .local.home/ With all the 64-bit cmpxchg removed, the complex 32-bit workaround can also= be removed. The 32-bit and 64-bit logic is now exactly the same. Link: https://lore.kernel.org/all/20231213214632.15047c40@gandalf.local.hom= e/ Link: https://lore.kernel.org/linux-trace-kernel/20231219074303.28f9abda@ga= ndalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Linus Torvalds Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 176 ++----------------------------------- 1 file changed, 9 insertions(+), 167 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 16b640d824f9..a4ada4f303c4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,6 +27,7 @@ #include #include =20 +#include #include =20 /* @@ -463,27 +464,9 @@ enum { RB_CTX_MAX }; =20 -#if BITS_PER_LONG =3D=3D 32 -#define RB_TIME_32 -#endif - -/* To test on 64 bit machines */ -//#define RB_TIME_32 - -#ifdef RB_TIME_32 - -struct rb_time_struct { - local_t cnt; - local_t top; - local_t bottom; - local_t msb; -}; -#else -#include struct rb_time_struct { local64_t time; }; -#endif typedef struct rb_time_struct rb_time_t; =20 #define MAX_NEST 5 @@ -573,147 +556,14 @@ struct ring_buffer_iter { int missed_events; }; =20 -#ifdef RB_TIME_32 - -/* - * On 32 bit machines, local64_t is very expensive. As the ring - * buffer doesn't need all the features of a true 64 bit atomic, - * on 32 bit, it uses these functions (64 still uses local64_t). - * - * For the ring buffer, 64 bit required operations for the time is - * the following: - * - * - Reads may fail if it interrupted a modification of the time stamp. - * It will succeed if it did not interrupt another write even if - * the read itself is interrupted by a write. - * It returns whether it was successful or not. - * - * - Writes always succeed and will overwrite other writes and writes - * that were done by events interrupting the current write. - * - * - A write followed by a read of the same time stamp will always succee= d, - * but may not contain the same value. - * - * - A cmpxchg will fail if it interrupted another write or cmpxchg. - * Other than that, it acts like a normal cmpxchg. - * - * The 60 bit time stamp is broken up by 30 bits in a top and bottom half - * (bottom being the least significant 30 bits of the 60 bit time stamp). - * - * The two most significant bits of each half holds a 2 bit counter (0-3). - * Each update will increment this counter by one. - * When reading the top and bottom, if the two counter bits match then the - * top and bottom together make a valid 60 bit number. - */ -#define RB_TIME_SHIFT 30 -#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) -#define RB_TIME_MSB_SHIFT 60 - -static inline int rb_time_cnt(unsigned long val) -{ - return (val >> RB_TIME_SHIFT) & 3; -} - -static inline u64 rb_time_val(unsigned long top, unsigned long bottom) -{ - u64 val; - - val =3D top & RB_TIME_VAL_MASK; - val <<=3D RB_TIME_SHIFT; - val |=3D bottom & RB_TIME_VAL_MASK; - - return val; -} - -static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *c= nt) -{ - unsigned long top, bottom, msb; - unsigned long c; - - /* - * If the read is interrupted by a write, then the cnt will - * be different. Loop until both top and bottom have been read - * without interruption. - */ - do { - c =3D local_read(&t->cnt); - top =3D local_read(&t->top); - bottom =3D local_read(&t->bottom); - msb =3D local_read(&t->msb); - } while (c !=3D local_read(&t->cnt)); - - *cnt =3D rb_time_cnt(top); - - /* If top, msb or bottom counts don't match, this interrupted a write */ - if (*cnt !=3D rb_time_cnt(msb) || *cnt !=3D rb_time_cnt(bottom)) - return false; - - /* The shift to msb will lose its cnt bits */ - *ret =3D rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); - return true; -} - -static bool rb_time_read(rb_time_t *t, u64 *ret) -{ - unsigned long cnt; - - return __rb_time_read(t, ret, &cnt); -} - -static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned lo= ng cnt) -{ - return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); -} - -static inline void rb_time_split(u64 val, unsigned long *top, unsigned lon= g *bottom, - unsigned long *msb) -{ - *top =3D (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); - *bottom =3D (unsigned long)(val & RB_TIME_VAL_MASK); - *msb =3D (unsigned long)(val >> RB_TIME_MSB_SHIFT); -} - -static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned= long cnt) -{ - val =3D rb_time_val_cnt(val, cnt); - local_set(t, val); -} - -static void rb_time_set(rb_time_t *t, u64 val) -{ - unsigned long cnt, top, bottom, msb; - - rb_time_split(val, &top, &bottom, &msb); - - /* Writes always succeed with a valid number even if it gets interrupted.= */ - do { - cnt =3D local_inc_return(&t->cnt); - rb_time_val_set(&t->top, top, cnt); - rb_time_val_set(&t->bottom, bottom, cnt); - rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); - } while (cnt !=3D local_read(&t->cnt)); -} - -static inline bool -rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) -{ - return local_try_cmpxchg(l, &expect, set); -} - -#else /* 64 bits */ - -/* local64_t always succeeds */ - -static inline bool rb_time_read(rb_time_t *t, u64 *ret) +static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret =3D local64_read(&t->time); - return true; } static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } -#endif =20 /* * Enable this to make sure that the event passed to @@ -820,10 +670,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *= buffer, WARN_ONCE(1, "nest (%d) greater than max", nest); =20 fail: - /* Can only fail on 32 bit */ - if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) - /* Screw it, just read the current time */ - ts =3D rb_time_stamp(cpu_buffer->buffer); + rb_time_read(&cpu_buffer->write_stamp, &ts); =20 return ts; } @@ -2820,7 +2667,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_bu= ffer, (unsigned long long)info->ts, (unsigned long long)info->before, (unsigned long long)info->after, - (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_sta= mp) ? write_stamp : 0), + (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_st= amp); write_stamp;}), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" @@ -3497,16 +3344,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_b= uffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; - bool a_ok; - bool b_ok; =20 /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page =3D info->tail_page =3D READ_ONCE(cpu_buffer->tail_page); =20 /*A*/ w =3D local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - b_ok =3D rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok =3D rb_time_read(&cpu_buffer->write_stamp, &info->after); + rb_time_read(&cpu_buffer->before_stamp, &info->before); + rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts =3D rb_time_stamp(cpu_buffer->buffer); =20 @@ -3521,7 +3366,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buf= fer, if (!w) { /* Use the sub-buffer timestamp */ info->delta =3D 0; - } else if (unlikely(!a_ok || !b_ok || info->before !=3D info->after)) { + } else if (unlikely(info->before !=3D info->after)) { info->add_timestamp |=3D RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length +=3D RB_LEN_TIME_EXTEND; } else { @@ -3570,8 +3415,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buf= fer, /* SLOW PATH - Interrupted between A and C */ =20 /* Save the old before_stamp */ - a_ok =3D rb_time_read(&cpu_buffer->before_stamp, &info->before); - RB_WARN_ON(cpu_buffer, !a_ok); + rb_time_read(&cpu_buffer->before_stamp, &info->before); =20 /* * Read a new timestamp and update the before_stamp to make @@ -3583,9 +3427,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buf= fer, rb_time_set(&cpu_buffer->before_stamp, ts); =20 barrier(); - /*E*/ a_ok =3D rb_time_read(&cpu_buffer->write_stamp, &info->after); - /* Was interrupted before here, write_stamp must be valid */ - RB_WARN_ON(cpu_buffer, !a_ok); + /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); /*F*/ if (write =3D=3D (local_read(&tail_page->write) & RB_WRITE_MASK) && info->after =3D=3D info->before && info->after < ts) { --=20 2.42.0 From nobody Fri Dec 19 11:04:11 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9420F22EF2 for ; Tue, 19 Dec 2023 14:32:02 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id 7BD20C433CA; Tue, 19 Dec 2023 14:32:02 +0000 (UTC) Received: from rostedt by gandalf with local (Exim 4.97) (envelope-from ) id 1rFb9Z-00000003MHw-2T5v; Tue, 19 Dec 2023 09:33:01 -0500 Message-ID: <20231219143301.369885334@goodmis.org> User-Agent: quilt/0.67 Date: Tue, 19 Dec 2023 09:32:42 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton , "Joel Fernandes (Google)" Subject: [for-next][PATCH 09/10] ring-buffer: Add interrupt information to dump of data sub-buffer References: <20231219143233.204534014@goodmis.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Steven Rostedt (Google)" When the ring buffer timestamp verifier triggers, it dumps the content of the sub-buffer. But currently it only dumps the timestamps and the offset of the data as well as the deltas. It would be even more informative if the event data also showed the interrupt context level it was in. That is, if each event showed that the event was written in normal, softirq, irq or NMI context. Then a better idea about how the events may have been interrupted from each other. As the payload of the ring buffer is really a black box of the ring buffer, just assume that if the payload is larger than a trace entry, that it is a trace entry. As trace entries have the interrupt context information saved in a flags field, look at that location and report the output of the flags. If the payload is not a trace entry, there's no way to really know, and the information will be garbage. But that's OK, because this is for debugging only (this output is not used in production as the buffer check that calls it causes a huge overhead to the tracing). This information, when available, is crucial for debugging timestamp issues. If it's garbage, it will also be pretty obvious that its garbage too. As this output usually happens in kselftests of the tracing code, the user will know what the payload is at the time. Link: https://lore.kernel.org/linux-trace-kernel/20231219074542.6f304601@ga= ndalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Suggested-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 79 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a4ada4f303c4..c0cc45482e1e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3185,6 +3185,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); #define CHECK_FULL_PAGE 1L =20 #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS + +static const char *show_irq_str(int bits) +{ + const char *type[] =3D { + ".", // 0 + "s", // 1 + "h", // 2 + "Hs", // 3 + "n", // 4 + "Ns", // 5 + "Nh", // 6 + "NHs", // 7 + }; + + return type[bits]; +} + +/* Assume this is an trace event */ +static const char *show_flags(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + int bits =3D 0; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return "X"; + + entry =3D ring_buffer_event_data(event); + + if (entry->flags & TRACE_FLAG_SOFTIRQ) + bits |=3D 1; + + if (entry->flags & TRACE_FLAG_HARDIRQ) + bits |=3D 2; + + if (entry->flags & TRACE_FLAG_NMI) + bits |=3D 4; + + return show_irq_str(bits); +} + +static const char *show_irq(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return ""; + + entry =3D ring_buffer_event_data(event); + if (entry->flags & TRACE_FLAG_IRQS_OFF) + return "d"; + return ""; +} + +static const char *show_interrupt_level(void) +{ + unsigned long pc =3D preempt_count(); + unsigned char level =3D 0; + + if (pc & SOFTIRQ_OFFSET) + level |=3D 1; + + if (pc & HARDIRQ_MASK) + level |=3D 2; + + if (pc & NMI_MASK) + level |=3D 4; + + return show_irq_str(level); +} + static void dump_buffer_page(struct buffer_data_page *bpage, struct rb_event_info *info, unsigned long tail) @@ -3224,8 +3294,9 @@ static void dump_buffer_page(struct buffer_data_page = *bpage, =20 case RINGBUF_TYPE_DATA: ts +=3D event->time_delta; - pr_warn(" 0x%x: [%lld] delta:%d\n", - e, ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d %s%s\n", + e, ts, event->time_delta, + show_flags(event), show_irq(event)); break; =20 default: @@ -3316,11 +3387,11 @@ static void check_buffer(struct ring_buffer_per_cpu= *cpu_buffer, atomic_inc(&cpu_buffer->record_disabled); /* There's some cases in boot up that this can happen */ WARN_ON_ONCE(system_state !=3D SYSTEM_BOOTING); - pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%l= ld before:%lld after:%lld%s\n", + pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%l= ld before:%lld after:%lld%s context:%s\n", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, - full ? " (full)" : ""); + full ? " (full)" : "", show_interrupt_level()); dump_buffer_page(bpage, info, tail); atomic_dec(&ts_dump); /* Do not re-enable checking */ --=20 2.42.0 From nobody Fri Dec 19 11:04:11 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D95992629C for ; Tue, 19 Dec 2023 14:32:02 +0000 (UTC) Received: by smtp.kernel.org (Postfix) with ESMTPSA id BDDF6C433C9; Tue, 19 Dec 2023 14:32:02 +0000 (UTC) Received: from rostedt by gandalf with local (Exim 4.97) (envelope-from ) id 1rFb9Z-00000003MIS-3kKA; Tue, 19 Dec 2023 09:33:01 -0500 Message-ID: <20231219143301.661736978@goodmis.org> User-Agent: quilt/0.67 Date: Tue, 19 Dec 2023 09:32:43 -0500 From: Steven Rostedt To: linux-kernel@vger.kernel.org Cc: Masami Hiramatsu , Mark Rutland , Mathieu Desnoyers , Andrew Morton Subject: [for-next][PATCH 10/10] ring-buffer: Check if absolute timestamp goes backwards References: <20231219143233.204534014@goodmis.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Steven Rostedt (Google)" The check_buffer() which checks the timestamps of the ring buffer sub-buffer page, when enabled, only checks if the adding of deltas of the events from the last absolute timestamp or the timestamp of the sub-buffer page adds up to the current event. What it does not check is if the absolute timestamp causes the time of the events to go backwards, as that can cause issues elsewhere. Test for the timestamp going backwards too. This also fixes a slight issue where if the warning triggers at boot up (because of the resetting of the tsc), it will disable all further checks, even those that are after boot Have it continue checking if the warning was ignored during boot up. Link: https://lore.kernel.org/linux-trace-kernel/20231219074732.18b092d4@ga= ndalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 46 +++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c0cc45482e1e..f7dc74e45ebf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3309,6 +3309,23 @@ static void dump_buffer_page(struct buffer_data_page= *bpage, static DEFINE_PER_CPU(atomic_t, checking); static atomic_t ts_dump; =20 +#define buffer_warn_return(fmt, ...) \ + do { \ + /* If another report is happening, ignore this one */ \ + if (atomic_inc_return(&ts_dump) !=3D 1) { \ + atomic_dec(&ts_dump); \ + goto out; \ + } \ + atomic_inc(&cpu_buffer->record_disabled); \ + pr_warn(fmt, ##__VA_ARGS__); \ + dump_buffer_page(bpage, info, tail); \ + atomic_dec(&ts_dump); \ + /* There's some cases in boot up that this can happen */ \ + if (WARN_ON_ONCE(system_state !=3D SYSTEM_BOOTING)) \ + /* Do not re-enable checking */ \ + return; \ + } while (0) + /* * Check if the current event time stamp matches the deltas on * the buffer page. @@ -3362,7 +3379,12 @@ static void check_buffer(struct ring_buffer_per_cpu = *cpu_buffer, =20 case RINGBUF_TYPE_TIME_STAMP: delta =3D rb_event_time_stamp(event); - ts =3D rb_fix_abs_ts(delta, ts); + delta =3D rb_fix_abs_ts(delta, ts); + if (delta < ts) { + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %l= ld absolute ts: %lld\n", + cpu_buffer->cpu, ts, delta); + } + ts =3D delta; break; =20 case RINGBUF_TYPE_PADDING: @@ -3379,23 +3401,11 @@ static void check_buffer(struct ring_buffer_per_cpu= *cpu_buffer, } if ((full && ts > info->ts) || (!full && ts + info->delta !=3D info->ts)) { - /* If another report is happening, ignore this one */ - if (atomic_inc_return(&ts_dump) !=3D 1) { - atomic_dec(&ts_dump); - goto out; - } - atomic_inc(&cpu_buffer->record_disabled); - /* There's some cases in boot up that this can happen */ - WARN_ON_ONCE(system_state !=3D SYSTEM_BOOTING); - pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%l= ld before:%lld after:%lld%s context:%s\n", - cpu_buffer->cpu, - ts + info->delta, info->ts, info->delta, - info->before, info->after, - full ? " (full)" : "", show_interrupt_level()); - dump_buffer_page(bpage, info, tail); - atomic_dec(&ts_dump); - /* Do not re-enable checking */ - return; + buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%l= ld delta:%lld before:%lld after:%lld%s context:%s\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, + info->before, info->after, + full ? " (full)" : "", show_interrupt_level()); } out: atomic_dec(this_cpu_ptr(&checking)); --=20 2.42.0