From nobody Tue Jun 30 12:07:36 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id F2E34C433FE for ; Mon, 17 Jan 2022 16:49:25 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S237774AbiAQQrQ (ORCPT ); Mon, 17 Jan 2022 11:47:16 -0500 Received: from foss.arm.com ([217.140.110.172]:60630 "EHLO foss.arm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S237721AbiAQQrP (ORCPT ); Mon, 17 Jan 2022 11:47:15 -0500 Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 19536101E; Mon, 17 Jan 2022 08:47:15 -0800 (PST) Received: from e113632-lin.cambridge.arm.com (e113632-lin.cambridge.arm.com [10.1.196.57]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id A5D9E3F766; Mon, 17 Jan 2022 08:47:12 -0800 (PST) From: Valentin Schneider To: linux-kernel@vger.kernel.org Cc: Abhijeet Dharmapurikar , =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= , Dietmar Eggemann , Steven Rostedt , Peter Zijlstra , Ingo Molnar , Vincent Guittot , Thomas Gleixner , Sebastian Andrzej Siewior , Juri Lelli , Daniel Bristot de Oliveira , Kees Cook , Andrew Morton , "Eric W. 
Biederman" , Alexey Gladkov , "Kenta.Tada@sony.com" , Randy Dunlap , Ed Tsai Subject: [PATCH v2 1/2] sched/tracing: Don't re-read p->state when emitting sched_switch event Date: Mon, 17 Jan 2022 16:46:32 +0000 Message-Id: <20220117164633.322550-2-valentin.schneider@arm.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220117164633.322550-1-valentin.schneider@arm.com> References: <20220117164633.322550-1-valentin.schneider@arm.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" As of commit c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") the following sequence becomes possible:

                      p->__state =3D TASK_INTERRUPTIBLE;
                      __schedule()
                        deactivate_task(p);
  ttwu()
    READ !p->on_rq
    p->__state=3DTASK_WAKING
                        trace_sched_switch()
                          __trace_sched_switch_state()
                            task_state_index()
                              return 0;

TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in the trace event. Prevent this by pushing the value read from __schedule() down to the trace event.
Reported-by: Abhijeet Dharmapurikar Signed-off-by: Valentin Schneider --- include/linux/sched.h | 11 ++++++++--- include/trace/events/sched.h | 11 +++++++---- kernel/sched/core.c | 4 ++-- kernel/trace/fgraph.c | 4 +++- kernel/trace/ftrace.c | 4 +++- kernel/trace/trace_events.c | 8 ++++++-- kernel/trace/trace_osnoise.c | 4 +++- kernel/trace/trace_sched_switch.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + 9 files changed, 34 insertions(+), 14 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d2e261adb8ea..d00837d12b9d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1616,10 +1616,10 @@ static inline pid_t task_pgrp_nr(struct task_struct= *tsk) #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) =20 -static inline unsigned int task_state_index(struct task_struct *tsk) +static inline unsigned int __task_state_index(unsigned int tsk_state, + unsigned int tsk_exit_state) { - unsigned int tsk_state =3D READ_ONCE(tsk->__state); - unsigned int state =3D (tsk_state | tsk->exit_state) & TASK_REPORT; + unsigned int state =3D (tsk_state | tsk_exit_state) & TASK_REPORT; =20 BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); =20 @@ -1629,6 +1629,11 @@ static inline unsigned int task_state_index(struct t= ask_struct *tsk) return fls(state); } =20 +static inline unsigned int task_state_index(struct task_struct *tsk) +{ + return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); +} + static inline char task_index_to_char(unsigned int state) { static const char state_char[] =3D "RSDTtXZPI"; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 94640482cfe7..65e786756321 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -187,7 +187,9 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, TP_ARGS(p)); =20 #ifdef CREATE_TRACE_POINTS -static inline long __trace_sched_switch_state(bool preempt, struct task_st= ruct *p) +static inline long 
__trace_sched_switch_state(bool preempt, + unsigned int prev_state, + struct task_struct *p) { unsigned int state; =20 @@ -208,7 +210,7 @@ static inline long __trace_sched_switch_state(bool pree= mpt, struct task_struct * * it for left shift operation to get the correct task->state * mapping. */ - state =3D task_state_index(p); + state =3D __task_state_index(prev_state, p->exit_state); =20 return state ? (1 << (state - 1)) : state; } @@ -220,10 +222,11 @@ static inline long __trace_sched_switch_state(bool pr= eempt, struct task_struct * TRACE_EVENT(sched_switch, =20 TP_PROTO(bool preempt, + unsigned int prev_state, struct task_struct *prev, struct task_struct *next), =20 - TP_ARGS(preempt, prev, next), + TP_ARGS(preempt, prev_state, prev, next), =20 TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) @@ -239,7 +242,7 @@ TRACE_EVENT(sched_switch, memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid =3D prev->pid; __entry->prev_prio =3D prev->prio; - __entry->prev_state =3D __trace_sched_switch_state(preempt, prev); + __entry->prev_state =3D __trace_sched_switch_state(preempt, prev_state, = prev); memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid =3D next->pid; __entry->next_prio =3D next->prio; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe53e510e711..a8799a2d8546 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4822,7 +4822,7 @@ static struct rq *finish_task_switch(struct task_stru= ct *prev) { struct rq *rq =3D this_rq(); struct mm_struct *mm =3D rq->prev_mm; - long prev_state; + unsigned int prev_state; =20 /* * The previous task will have left us with a preempt_count of 2 @@ -6287,7 +6287,7 @@ static void __sched notrace __schedule(unsigned int s= ched_mode) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); =20 - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, 
next); =20 /* Also unlocks the rq: */ rq =3D context_switch(rq, prev, next, &rf); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 22061d38fc00..19028e072cdb 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -415,7 +415,9 @@ static int alloc_retstack_tasklist(struct ftrace_ret_st= ack **ret_stack_list) =20 static void ftrace_graph_probe_sched_switch(void *ignore, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { unsigned long long timestamp; int index; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 30bc880c3849..e296ddeec99f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7311,7 +7311,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *= ops) =20 static void ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { struct trace_array *tr =3D data; struct trace_pid_list *pid_list; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4021b9a79f93..6ddc6cc0d5d5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -759,7 +759,9 @@ void trace_event_follow_fork(struct trace_array *tr, bo= ol enable) =20 static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { struct trace_array *tr =3D data; struct trace_pid_list *no_pid_list; @@ -783,7 +785,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, boo= l preempt, =20 static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, - struct task_struct *prev, struct task_struct *next) + unsigned int prev_state, + struct task_struct *prev, + struct task_struct *next) { struct trace_array 
*tr =3D data; struct trace_pid_list *no_pid_list; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 7520d43aed55..a8a2d17f858c 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1168,7 +1168,9 @@ thread_exit(struct osnoise_variables *osn_var, struct= task_struct *t) * used to record the beginning and to report the end of a thread noise wi= ndow. */ static void -trace_sched_switch_callback(void *data, bool preempt, struct task_struct *= p, +trace_sched_switch_callback(void *data, bool preempt, + unsigned int prev_state, + struct task_struct *p, struct task_struct *n) { struct osnoise_variables *osn_var =3D this_cpu_osn_var(); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_s= witch.c index e304196d7c28..993b0ed10d8c 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex); =20 static void probe_sched_switch(void *ignore, bool preempt, + unsigned int prev_state, struct task_struct *prev, struct task_struct *next) { int flags; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_w= akeup.c index 2402de520eca..46429f9a96fa 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, =20 static void notrace probe_wakeup_sched_switch(void *ignore, bool preempt, + unsigned int prev_state, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; --=20 2.25.1 From nobody Tue Jun 30 12:07:36 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 2ED52C43217 for ; Mon, 17 Jan 2022 16:49:26 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id 
S241140AbiAQQrU (ORCPT ); Mon, 17 Jan 2022 11:47:20 -0500 Received: from foss.arm.com ([217.140.110.172]:60660 "EHLO foss.arm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S237778AbiAQQrS (ORCPT ); Mon, 17 Jan 2022 11:47:18 -0500 Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id B7FD11063; Mon, 17 Jan 2022 08:47:17 -0800 (PST) Received: from e113632-lin.cambridge.arm.com (e113632-lin.cambridge.arm.com [10.1.196.57]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 4FBA93F766; Mon, 17 Jan 2022 08:47:15 -0800 (PST) From: Valentin Schneider To: linux-kernel@vger.kernel.org Cc: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= , Steven Rostedt , Sebastian Andrzej Siewior , Abhijeet Dharmapurikar , Dietmar Eggemann , Peter Zijlstra , Ingo Molnar , Vincent Guittot , Thomas Gleixner , Juri Lelli , Daniel Bristot de Oliveira , Kees Cook , Andrew Morton , "Eric W. Biederman" , Alexey Gladkov , "Kenta.Tada@sony.com" , Randy Dunlap , Ed Tsai Subject: [PATCH v2 2/2] sched/tracing: Add TASK_RTLOCK_WAIT to TASK_REPORT Date: Mon, 17 Jan 2022 16:46:33 +0000 Message-Id: <20220117164633.322550-3-valentin.schneider@arm.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220117164633.322550-1-valentin.schneider@arm.com> References: <20220117164633.322550-1-valentin.schneider@arm.com> MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org TASK_RTLOCK_WAIT currently isn't part of TASK_REPORT, thus a task blocking on an rtlock will appear as having a task state =3D=3D 0, IOW TASK_RUNNING. The actual state is saved in p->saved_state, but reading it after reading p->__state has a few issues:
  o that could still be TASK_RUNNING in the case of e.g. rt_spin_lock
  o ttwu_state_match() might have changed that to TASK_RUNNING
Add TASK_RTLOCK_WAIT to TASK_REPORT.
Reported-by: Uwe Kleine-K=C3=B6nig Signed-off-by: Valentin Schneider Reviewed-by: Steven Rostedt Reviewed-by: Sebastian Andrzej Siewior --- fs/proc/array.c | 3 ++- include/linux/sched.h | 17 +++++++++-------- include/trace/events/sched.h | 1 + 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index ff869a66b34e..f4cae65529a6 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -128,9 +128,10 @@ static const char * const task_state_array[] =3D { "X (dead)", /* 0x10 */ "Z (zombie)", /* 0x20 */ "P (parked)", /* 0x40 */ + "L (rt-locked)", /* 0x80 */ =20 /* states beyond TASK_REPORT: */ - "I (idle)", /* 0x80 */ + "I (idle)", /* 0x100 */ }; =20 static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index d00837d12b9d..18fd77578dae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -91,13 +91,14 @@ struct task_group; #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ #define TASK_PARKED 0x0040 -#define TASK_DEAD 0x0080 -#define TASK_WAKEKILL 0x0100 -#define TASK_WAKING 0x0200 -#define TASK_NOLOAD 0x0400 -#define TASK_NEW 0x0800 /* RT specific auxilliary flag to mark RT lock waiters */ -#define TASK_RTLOCK_WAIT 0x1000 +#define TASK_RTLOCK_WAIT 0x0080 + +#define TASK_DEAD 0x0100 +#define TASK_WAKEKILL 0x0200 +#define TASK_WAKING 0x0400 +#define TASK_NOLOAD 0x0800 +#define TASK_NEW 0x1000 #define TASK_STATE_MAX 0x2000 =20 /* Convenience macros for the sake of set_current_state: */ @@ -114,7 +115,7 @@ struct task_group; #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ - TASK_PARKED) + TASK_PARKED | TASK_RTLOCK_WAIT) =20 #define task_is_running(task) (READ_ONCE((task)->__state) =3D=3D TASK_RUN= NING) =20 @@ -1636,7 +1637,7 @@ static inline unsigned int task_state_index(struct ta= sk_struct *tsk) =20 static inline char 
task_index_to_char(unsigned int state) { - static const char state_char[] =3D "RSDTtXZPI"; + static const char state_char[] =3D "RSDTtXZPLI"; =20 BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) !=3D sizeof(state_char) - 1); =20 diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 65e786756321..f86ec9af19ff 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -261,6 +261,7 @@ TRACE_EVENT(sched_switch, { EXIT_DEAD, "X" }, { EXIT_ZOMBIE, "Z" }, { TASK_PARKED, "P" }, + { TASK_RTLOCK_WAIT, "L" }, { TASK_DEAD, "I" }) : "R", =20 --=20 2.25.1